diff --git a/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb b/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb index 19a7907a..9291c322 100644 --- a/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb +++ b/notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb @@ -18,6 +18,8 @@ }, "outputs": [], "source": [ + "import os\n", + "\n", "import gcsfs\n", "import google.auth\n", "import modin.config as cfg\n", @@ -28,8 +30,6 @@ "cfg.Engine.put(\"dask\")\n", "ProgressBar.enable()\n", "\n", - "import os\n", - "\n", "import wandb\n", "from tqdm.auto import tqdm\n" ] @@ -91,7 +91,7 @@ " \"\"\"\n", " create a dataframe and optimize its memory usage.\n", "\n", - " I. e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n", + " I.e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n", " of unique columns and chunking to enable import.\n", "\n", " Adapted from here:\n", @@ -201,19 +201,19 @@ "outputs": [], "source": [ "def df_to_parquet(\n", - " df: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n", + " x: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n", ") -> None:\n", " \"\"\"\n", " Write pd.DataFrame to parquet format.\n", "\n", " Args:\n", - " df (pd.DataFrame): input dataframe.\n", + " x (pd.DataFrame): input dataframe.\n", " target_dir (str): local directory where parquet files are written to.\n", " chunk_size (int, optional): number of rows stored in one chunk of parquet file.\n", " Defaults to 1000000.\n", " \"\"\"\n", - " for i in tqdm(range(0, len(df), chunk_size)):\n", - " slc = df.iloc[i : i + chunk_size]\n", + " for i in tqdm(range(0, len(x), chunk_size)):\n", + " slc = x.iloc[i : i + chunk_size]\n", " chunk = int(i / chunk_size)\n", " output_path = (\n", " target_dir\n", diff --git a/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb b/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb index 6ca0c158..0716ca5d 100644 --- a/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb +++ b/notebooks/2.0a-mb-data-preprocessing-supervised.ipynb @@ -19,8 +19,6 @@ "from catboost import CatBoostClassifier, Pool\n", "from numpy.testing import assert_almost_equal\n", "from pandas._testing.asserters import assert_almost_equal\n", - "from sklearn.metrics import roc_auc_score\n", - "from sklearn.model_selection import cross_val_predict\n", "from tqdm.auto import tqdm\n", "\n", "sys.path.append(\"..\")\n" @@ -104,9 +102,6 @@ " for i in range(0, max_i)\n", "]\n", "\n", - "# asks = [f\"ASK_{i}\" for i in range(1, 17)]\n", - "# bids = [f\"BID_{i}\" for i in range(1, 17)]\n", - "\n", "columns = [\n", " \"QUOTE_DATETIME\",\n", " \"ROOT\",\n", @@ -129,8 +124,6 @@ " \"price_ex_lag\",\n", " \"issue_type\",\n", " \"myn\",\n", - " # *asks,\n", - " # *bids,\n", " \"buy_sell\",\n", "]\n", "\n", @@ -156,23 +149,6 @@ "df.memory_usage(deep=True).sum()\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 739 - }, - "id": "gHnFz65rZnOZ", - "outputId": "e5d3b5d7-3e01-4483-ba82-97793c5ebc68", - "tags": [] - }, - "outputs": [], - "source": [ - "df.head().T\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -268,6 +244,7 @@ "outputs": [], "source": [ "# indices\n", + "train_range, val_range, test_range = None, None, None\n", "\n", "if EXCHANGE == \"ise\" and STRATEGY == \"supervised\":\n", " train_range = df.QUOTE_DATETIME.between(\n", diff --git a/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb 
b/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb index 5ecc9961..3fa34286 100644 --- a/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb +++ b/notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb @@ -12,7 +12,6 @@ "import os\n", "\n", "import gcsfs\n", - "import numpy as np\n", "import pandas as pd\n", "import wandb\n", "\n", @@ -161,7 +160,7 @@ }, "outputs": [], "source": [ - "df.head()" + "df.head()\n" ] }, { @@ -187,7 +186,7 @@ }, "outputs": [], "source": [ - "labelled_df.head()" + "labelled_df.head()\n" ] }, { @@ -227,7 +226,7 @@ " \"bid_ex\",\n", " \"bid_size_ex\",\n", " \"ask_size_ex\",\n", - " # FIXME is different for loballed and unlabelled trades\n", + " # Can be different for labelled and unlabelled trades:\n", " # 'optionid','issue_type', 'myn',\n", " # 'price_all_lead', 'price_all_lag',\n", " # 'price_ex_lead', 'price_ex_lag',\n", @@ -266,7 +265,7 @@ }, "outputs": [], "source": [ - "labelled_df['index_labelled'] = labelled_df.index" + "labelled_df[\"index_labelled\"] = labelled_df.index\n" ] }, { @@ -307,10 +306,10 @@ " \"bid_ex\",\n", " \"bid_size_ex\",\n", " \"ask_size_ex\",\n", - " # myn seems to be different\n", - " #'issue_type', 'optionid',\n", - " # 'price_all_lead', 'price_all_lag', # FIXME is different for loballed and unlabelled trades\n", - " # 'price_ex_lead', 'price_ex_lag', # FIXME is different for loballed and unlabelled trades\n", + " # myn seems to be different for labelled and unlabelled trades\n", + " # 'issue_type', 'optionid',\n", + " # 'price_all_lead', 'price_all_lag',\n", + " # 'price_ex_lead', 'price_ex_lag',\n", " ],\n", " how=\"left\",\n", " indicator=\"exists\",\n", @@ -326,7 +325,7 @@ }, "outputs": [], "source": [ - "df_w_indicator.head(50)" + "df_w_indicator.head(50)\n" ] }, { @@ -339,7 +338,7 @@ "source": [ "# interpolate missing indices. index increases 1 -> 2. So filling with float seems ok.
will be inserted between int of labelled df.\n", "df_w_indicator[\"index_labelled\"].interpolate(\"linear\", inplace=True)\n", - "df_w_indicator.set_index(keys = \"index_labelled\", drop=True, inplace=True)" + "df_w_indicator.set_index(keys=\"index_labelled\", drop=True, inplace=True)\n" ] }, { @@ -409,63 +408,6 @@ "df_w_indicator[df_w_indicator[\"exists\"] == \"both\"].head(20).T\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analysis of Accucacies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# foo = df_w_indicator[df_w_indicator[\"exists\"] == \"both\"][\n", - "# [\"price_ex_lag_labelled\", \"price_ex_lag_unlabelled\", \"buy_sell\", \"TRADE_PRICE\"]\n", - "# ]\n", - "\n", - "# foo[\"tick_unlabelled\"] = np.where(\n", - "# foo[\"TRADE_PRICE\"] > foo[\"price_ex_lag_unlabelled\"],\n", - "# 1,\n", - "# np.where(foo[\"TRADE_PRICE\"] < foo[\"price_ex_lag_unlabelled\"], -1, np.nan),\n", - "# )\n", - "# foo[\"tick_labelled\"] = np.where(\n", - "# foo[\"TRADE_PRICE\"] > foo[\"price_ex_lag_labelled\"],\n", - "# 1,\n", - "# np.where(foo[\"TRADE_PRICE\"] < foo[\"price_ex_lag_labelled\"], -1, np.nan),\n", - "# )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# foo.head()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# acc_unlabelled = (foo.buy_sell == foo.tick_unlabelled).sum() / len(foo)\n", - "# acc_labelled = (foo.buy_sell == foo.tick_labelled).sum() / len(foo)\n", - "\n", - "# print(acc_unlabelled)\n", - "# print(acc_labelled)\n" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -481,10 +423,10 @@ "source": [ "# use last 6 months. 
May increase later\n", "date_range = df_w_indicator.QUOTE_DATETIME.between(\n", - " \"2013-04-24 00:00:00\", \"2013-10-24 16:14:48\"\n", + " \"2013-04-24 00:00:00\", \"2013-10-24 16:14:48\"\n", ")\n", "\n", - "df_w_indicator = df_w_indicator[date_range]" + "df_w_indicator = df_w_indicator[date_range]\n" ] }, { diff --git a/notebooks/3.0a-mb-explanatory-data-analysis.ipynb b/notebooks/3.0a-mb-explanatory-data-analysis.ipynb index 469b1859..a2f5cf11 100644 --- a/notebooks/3.0a-mb-explanatory-data-analysis.ipynb +++ b/notebooks/3.0a-mb-explanatory-data-analysis.ipynb @@ -8,6 +8,8 @@ }, "outputs": [], "source": [ + "from __future__ import annotations\n", + "\n", "import os\n", "import random\n", "import warnings\n", @@ -16,9 +18,7 @@ "\n", "warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n", "\n", - "from __future__ import annotations\n", - "\n", - "from typing import Any, List, Tuple\n", + "from typing import Any, List\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", diff --git a/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb b/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb index 3d9e9b0c..fecb786f 100644 --- a/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb +++ b/notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb @@ -7,32 +7,25 @@ "metadata": {}, "outputs": [], "source": [ + "from __future__ import annotations\n", + "\n", "import wandb\n", - "import sys\n", - "import warnings\n", "from pathlib import Path\n", - "import optuna\n", "import pandas as pd\n", "\n", "import os\n", - "import sys\n", "\n", "from otc.features.build_features import (\n", - " features_categorical,\n", - " features_classical,\n", " features_classical_size,\n", - " features_ml,\n", ")\n", "\n", - "from __future__ import annotations\n", "\n", - "from typing import Any, List, Tuple\n", + "from typing import List\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", - "from scipy import stats\n", "\n", "from tqdm.auto import tqdm" ] diff --git a/notebooks/3.0c-feature-engineering.ipynb b/notebooks/3.0c-feature-engineering.ipynb index 466f2f3d..b219279c 100644 --- a/notebooks/3.0c-feature-engineering.ipynb +++ b/notebooks/3.0c-feature-engineering.ipynb @@ -17,16 +17,15 @@ "\n", "import gcsfs\n", "import google.auth\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import numpy.typing as npt\n", "import pandas as pd\n", "import wandb\n", - "from catboost import CatBoostClassifier, Pool\n", "from sklearn.exceptions import NotFittedError\n", - "from sklearn.metrics import matthews_corrcoef\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import (OrdinalEncoder, PowerTransformer,\n", - " RobustScaler, StandardScaler)" + "\n", + "from sklearn.preprocessing import (OrdinalEncoder, StandardScaler)\n", + "\n", + "from scipy.stats import ks_2samp" ] }, { @@ -610,175 +609,12 @@ "run.finish()\n" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "H8u4au_6XAMx" - }, - "source": [ - "## Adversarial Validation\n", - "> Adversarial Validation is a technique allowing you to easily estimate the degree of difference between your training and test data. This technique was long rumored among Kaggle participants and transmitted from team to team until it emerged publicly thanks to a post by Zygmunt Zając (https://www.kaggle.com/zygmunt) on his FastML blog. (adapted from Banchawicz et. 
al)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "features_classical = [\n", - " \"TRADE_PRICE\",\n", - " \"bid_ex\",\n", - " \"ask_ex\",\n", - " \"BEST_ASK\",\n", - " \"BEST_BID\",\n", - " \"price_ex_lag\",\n", - " \"price_ex_lead\",\n", - " \"price_all_lag\",\n", - " \"price_all_lead\",\n", - " \"chg_ex_lead\",\n", - " \"chg_ex_lag\",\n", - " \"chg_all_lead\",\n", - " \"chg_all_lag\",\n", - " \"prox_ex\",\n", - " \"prox_best\",\n", - "]\n", - "\n", - "features_size = [\n", - " \"bid_ask_size_ratio_ex\",\n", - " \"rel_bid_size_ex\",\n", - " \"rel_ask_size_ex\",\n", - " \"TRADE_SIZE\",\n", - " \"bid_size_ex\",\n", - " \"ask_size_ex\",\n", - " \"depth_ex\",\n", - "]\n", - "\n", - "features_classical_size = [\n", - " *features_classical,\n", - " *features_size,\n", - " \"buy_sell\", # add here and remove later\n", - "]\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train = pd.read_parquet(\n", - " \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/train_set_60.parquet\",\n", - " engine=\"fastparquet\",\n", - " columns=features_classical_size,\n", - ")\n", - "val = pd.read_parquet(\n", - " \"gs://thesis-bucket-option-trade-classification/data/ise_log_standardized/val_set_20.parquet\",\n", - " engine=\"fastparquet\",\n", - " columns=features_classical_size,\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GKpvTE4EXAM0", - "outputId": "222f6a61-548f-4f1c-8064-caffcdfe637e" - }, - "outputs": [], - "source": [ - "X = pd.concat([train, val])\n", - "X.drop(columns=[\"buy_sell\"], inplace=True)\n", - "# assign zeros to train set and ones to test set\n", - "y = [0] * len(train) + [1] * len(val)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X.columns\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tperioc56aCt" - }, - "outputs": [], - "source": [ - "# perform cv with catboost classifier\n", - "clf = CatBoostClassifier(\n", - " task_type=\"GPU\",\n", - " logging_level=\"Silent\",\n", - " random_seed=42,\n", - " eval_metric=\"Accuracy\",\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X, y, test_size=0.2, random_state=42, shuffle=True\n", - ")\n", - "clf.fit(X_train, y_train, eval_set=(X_test, y_test))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = clf.predict(X_test)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use mcc as data is imbalanced 3/4 train set, 1/4 val set\n", - "print(matthews_corrcoef(y_test, y_pred))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feature_importance = clf.get_feature_importance(\n", - " prettified=True, type=\"FeatureImportance\"\n", - ")\n", - "feature_importance\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "feature_importance.to_csv(\"feature_importance_gbm_classical_size.csv\")\n" - ] - }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Kolmogorov Smirnov" + "## Kolmogorov Smirnov 
Test" ] }, { @@ -787,8 +623,6 @@ "metadata": {}, "outputs": [], "source": [ - "from scipy.stats import ks_2samp\n", - "\n", "cols = train.columns.tolist()\n", "# cols.remove(\"buy_sell\")\n", "results = []\n", @@ -807,16 +641,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Auto-Correlation" + "## Auto-Correlation Of Features" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/9.0d-mb-adv_val.ipynb b/notebooks/3.0d-mb-adv_val.ipynb similarity index 91% rename from notebooks/9.0d-mb-adv_val.ipynb rename to notebooks/3.0d-mb-adv_val.ipynb index e029b957..d9bdf0c8 100644 --- a/notebooks/9.0d-mb-adv_val.ipynb +++ b/notebooks/3.0d-mb-adv_val.ipynb @@ -1,5 +1,12 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Adversarial Validation is a technique allowing you to easily estimate the degree of difference between your training and test data. This technique was long rumored among Kaggle participants and transmitted from team to team until it emerged publicly thanks to a post by Zygmunt Zając (https://www.kaggle.com/zygmunt) on his FastML blog. (adapted from Banchawicz et. al)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -13,7 +20,6 @@ "import sys\n", "from pathlib import Path\n", "\n", - "import numpy as np\n", "import pandas as pd\n", "import wandb\n", "from catboost import CatBoostClassifier, Pool\n", @@ -123,7 +129,7 @@ "id": "zMIOV1jA_ImH" }, "source": [ - "## CatBoost Baseline 🐈‍⬛" + "## CatBoost" ] }, { diff --git a/notebooks/4.0a-mb-logistic-regression.ipynb b/notebooks/4.0a-mb-logistic-regression.ipynb index 0282a923..ee278a31 100644 --- a/notebooks/4.0a-mb-logistic-regression.ipynb +++ b/notebooks/4.0a-mb-logistic-regression.ipynb @@ -9,12 +9,11 @@ }, "outputs": [], "source": [ - "import os, glob\n", + "import os\n", "import sys\n", "\n", "import math\n", "from pathlib import Path\n", - "from typing import List, Optional\n", "\n", "import numpy as np\n", "import pandas as pd\n", @@ -22,7 +21,13 @@ "import wandb\n", "from torch import nn\n", "from torch import nn, optim\n", - "from tqdm.auto import tqdm\n" + "from tqdm.auto import tqdm\n", + "\n", + "sys.path.append(\"..\")\n", + "from otc.data.dataset import TabDataset\n", + "from otc.data.dataloader import TabDataLoader\n", + "from otc.features.build_features import features_classical_size\n", + "from otc.optim.early_stopping import EarlyStopping" ] }, { @@ -34,9 +39,7 @@ }, "outputs": [], "source": [ - "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n", - "# fs = gcsfs.GCSFileSystem(project=\"thesis\")\n", - "# fs_prefix = \"gs://\"\n" + "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"" ] }, { @@ -55,44 +58,6 @@ "data_dir = artifact.download()\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "4cf7b797-6256-45c0-9bec-d4faf98d9daa", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "sys.path.append(\"..\")\n", - "from otc.data.dataset import TabDataset\n", - "from otc.data.dataloader import TabDataLoader\n", - "from otc.features.build_features import features_classical, features_classical_size\n", - "from otc.optim.early_stopping import EarlyStopping\n" - ] - }, - { - "cell_type": "markdown", - "id": "bd424255-737f-4590-93ee-e9e6dcfc3258", - "metadata": {}, - "source": [ - "https://arxiv.org/pdf/2106.11959.pdf\n", - "\n", - "Layer count 3\n", - "Feature embedding size 192\n", - "Head 
count 8\n", - "Activation & FFN size factor (ReGLU,\n", - "4/3)\n", - "Attention dropout 0.2\n", - "FFN dropout 0.1\n", - "Residual dropout 0.0\n", - "Initialization Kaiming (He et al., 2015a)\n", - "Parameter count 929K The value is given for 100 numerical features\n", - "Optimizer AdamW\n", - "Learning rate 1e−4\n", - "Weight decay 1e−5 0.0 for Feature Tokenizer, LayerNorm and biases\n" - ] - }, { "cell_type": "code", "execution_count": null, @@ -102,8 +67,7 @@ }, "outputs": [], "source": [ - "# preserve relative ordering, sample for testing ache\n", - "\n", + "# preserve relative ordering, sample for testing ace\n", "frac = 1\n", "\n", "# sample\n", @@ -111,7 +75,7 @@ "y_train = X_train[\"buy_sell\"]\n", "X_train = X_train[features_classical_size]\n", "\n", - "X_val = pd.read_parquet(Path(data_dir, \"val_set.parquet\"), engine=\"fastparquet\").sample(frac=frac)# .sample(frac=frac, random_state=42).sort_index()\n", + "X_val = pd.read_parquet(Path(data_dir, \"val_set.parquet\"), engine=\"fastparquet\").sample(frac=frac)\n", "y_val = X_val[\"buy_sell\"]\n", "X_val = X_val[features_classical_size]\n", "\n", @@ -197,30 +161,6 @@ ")\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "99743844-ba1e-4a9c-8650-8ce14ca48385", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def checkpoint(model, filename):\n", - " \n", - " # remove old files\n", - " for filename in glob.glob(f\"checkpoints/{run.id}*\"):\n", - " os.remove(filename) \n", - " \n", - " # create_dir\n", - " dir_checkpoints = \"checkpoints/\"\n", - " os.makedirs(dir_checkpoints, exist_ok = True) \n", - " \n", - " # save new file\n", - " print(\"saving new checkpoints.\")\n", - " torch.save(model.state_dict(), os.path.join(dir_checkpoints,f\"{run.id}*\"))" - ] - }, { "cell_type": "code", "execution_count": null, @@ -296,7 +236,6 @@ " # correct samples / no samples\n", " val_accuracy = correct / len(X_val)\n", " if best_accuracy < val_accuracy:\n", - " checkpoint(clf, f\"checkpoints/{run.id}-{step}.ptx\")\n", " best_accuracy = val_accuracy\n", " best_step = step\n", " \n", @@ -315,30 +254,6 @@ " break\n" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "51ca46d5-7448-4e63-8d93-ff3773fe5753", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cp = glob.glob(f\"checkpoints/{run.id}*\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ca09c3b-6598-4a6f-be55-ce0cf7de61d1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "clf.load_state_dict(torch.load(cp[0]))" - ] - }, { "cell_type": "code", "execution_count": null, @@ -351,12 +266,9 @@ "y_pred, y_true = [], []\n", "\n", "for x_cat, x_cont, weights, targets in test_loader:\n", - " # logits = clf(x_cont,x_cat).flatten() #\n", - " # for my implementation\n", " logits = clf(x_cat, x_cont).flatten()\n", " logits = logits.flatten()\n", "\n", - "\n", " # map between zero and one, sigmoid is otherwise included in loss already\n", " # https://stackoverflow.com/a/66910866/5755604\n", " preds = torch.sigmoid(logits.squeeze())\n", @@ -367,17 +279,10 @@ "y_pred = np.rint(np.concatenate(y_pred))\n", "y_true = np.concatenate(y_true)\n", "\n", + "# calculate accuracy on validation set\n", "acc = (y_pred == y_true).sum() / len(y_true)\n", "print(acc)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c882d2a-6cad-4ff2-af4a-a2b58518cab3", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/notebooks/4.0c-mb-feature-importances.ipynb 
b/notebooks/4.0c-mb-feature-importances.ipynb index b4f1d1c7..4c371385 100644 --- a/notebooks/4.0c-mb-feature-importances.ipynb +++ b/notebooks/4.0c-mb-feature-importances.ipynb @@ -34,23 +34,15 @@ "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "from matplotlib import rc\n", - "import matplotlib.dates as mdates\n", - "import matplotlib.ticker as ticker\n", - "from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, PercentFormatter\n", - "\n", "\n", "import pandas as pd\n", - "import seaborn as sns\n", - "import sklearn\n", - "from sklearn.metrics import accuracy_score\n", - "from sklearn.ensemble import HistGradientBoostingClassifier\n", "import torch\n", "from torch import nn\n", "\n", "sys.path.append(\"..\")\n", "from otc.models.classical_classifier import ClassicalClassifier\n", "\n", - "from sage import GroupedMarginalImputer, PermutationEstimator, MarginalImputer\n", + "from sage import GroupedMarginalImputer, PermutationEstimator\n", "\n", "from otc.features.build_features import (\n", " features_categorical,\n", @@ -59,13 +51,9 @@ " features_ml,\n", ")\n", "\n", - "from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer\n", - "from otc.models.activation import ReGLU\n", "from otc.data.dataset import TabDataset\n", "from otc.data.dataloader import TabDataLoader\n", "from otc.features.build_features import features_classical_size\n", - "from otc.optim.early_stopping import EarlyStopping\n", - "from otc.optim.scheduler import CosineWarmupScheduler\n", "\n", "import wandb\n", "from tqdm.auto import tqdm" @@ -740,20 +728,7 @@ " 39343128, 39343165, 39343193, 39343199, 39343211, 39343215,\n", " 39343234, 39343242, 39343298, 39343346, 39343370, 39343390,\n", " 39343412, 39343413, 39343415, 39343414, 39343426, 39343433,\n", - " 39343465, 39343464, 39343485, 39343498]\n", - "\n", - "# inside\n", - "# idx = [39342190, 39342187, 39342186, 39342184, 39342183, 39342182,\n", - "# 39342172, 39342180, 39342178, 39342177, 39342176, 39342173,\n", - "# 39342181, 39342232, 39342230, 39342226, 39342228, 39342227,\n", - "# 39342235, 39342224, 39342236, 39342242, 39342245, 39342246,\n", - "# 39342247, 39342250, 39342223, 39342195, 39342196, 39342197,\n", - "# 39342198, 39342203, 39342201, 39342207, 39342206, 39342213,\n", - "# 39342217, 39342210, 39342209, 39342270, 39342272, 39342267,\n", - "# 39342266, 39342264, 39342262, 39342268, 39342260, 39342261,\n", - "# 39342251, 39342253, 39342252, 39342258, 39342259, 39342255,\n", - "# 39342284, 39342283, 39342282, 39342280, 39342275, 39342278,\n", - "# 39342274, 39342279, 39342294, 39342293]" + " 39343465, 39343464, 39343485, 39343498]" ] }, { @@ -1251,21 +1226,13 @@ "with open(file_path, 'wb') as file:\n", " pickle.dump(data, file)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f0e85cd-f735-4f25-b254-57a15343845d", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "myenv", + "display_name": "Python 3", "language": "python", - "name": "myenv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1277,7 +1244,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.4" } }, "nbformat": 4, diff --git a/notebooks/4.0e-mb-fttransformer-pretraining.ipynb b/notebooks/4.0e-mb-fttransformer-pretraining.ipynb index b90bfabf..9b5e1ec7 100644 --- a/notebooks/4.0e-mb-fttransformer-pretraining.ipynb +++ b/notebooks/4.0e-mb-fttransformer-pretraining.ipynb @@ 
-8,18 +8,14 @@ "source": [ "import glob\n", "import os\n", - "import math\n", "from pathlib import Path\n", - "import sys\n", "\n", - "import numpy as np\n", "import pandas as pd\n", "\n", "from tqdm.auto import tqdm\n", "\n", "import torch\n", "import torch.nn as nn\n", - "import torch.nn.functional as F\n", "import torch.optim as optim\n", "\n", "import wandb\n", @@ -34,7 +30,7 @@ "\n", "from otc.data.dataset import TabDataset\n", "from otc.data.dataloader import TabDataLoader\n", - "from otc.features.build_features import features_classical, features_classical_size\n", + "from otc.features.build_features import features_classical_size\n", "from otc.optim.early_stopping import EarlyStopping\n", "from otc.optim.scheduler import CosineWarmupScheduler" ] diff --git a/notebooks/5.0a-mb-batch-size-finder.ipynb b/notebooks/5.0a-mb-batch-size-finder.ipynb index 39254d6d..40b478e1 100644 --- a/notebooks/5.0a-mb-batch-size-finder.ipynb +++ b/notebooks/5.0a-mb-batch-size-finder.ipynb @@ -7,24 +7,18 @@ "outputs": [], "source": [ "from time import sleep\n", - "from typing import Optional, Tuple\n", + "from typing import Optional\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", - "import torch.optim as optim\n", "from otc.data.dataloader import TabDataLoader\n", "from otc.models.activation import ReGLU\n", "from otc.models.fttransformer import (\n", - " CategoricalFeatureTokenizer,\n", - " CLSToken,\n", " FeatureTokenizer,\n", " FTTransformer,\n", - " MultiheadAttention,\n", - " NumericalFeatureTokenizer,\n", " Transformer,\n", ")\n", - "from tqdm import tqdm\n", "\n", "import os\n", "\n", @@ -75,7 +69,6 @@ " max_batch_size: Optional[int] = None,\n", " num_iterations: int = 5,\n", ") -> int:\n", - " # print(model)\n", " model.to(device)\n", " model.train(True)\n", " optimizer = torch.optim.AdamW(model.parameters())\n", @@ -186,14 +179,14 @@ "\n", " model = FTTransformer(feature_tokenizer, transformer)\n", "\n", - " batch_size = get_batch_size(\n", + " get_batch_size(\n", " model=model,\n", " device=device,\n", " min_batch_size=32,\n", " max_batch_size=1024 * 1024,\n", " )\n", "\n", - " batch_size = get_batch_size(\n", + " get_batch_size(\n", " model=model,\n", " device=device,\n", " min_batch_size=32,\n", diff --git a/notebooks/6.0a-mb-results-fttransformer.ipynb b/notebooks/6.0a-mb-results-fttransformer.ipynb index e53e319d..f456233a 100644 --- a/notebooks/6.0a-mb-results-fttransformer.ipynb +++ b/notebooks/6.0a-mb-results-fttransformer.ipynb @@ -240,7 +240,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Get and save embeddings" + "## Get and save embeddings 💤" ] }, { diff --git a/notebooks/6.0b-mb-results-classical-rules.ipynb b/notebooks/6.0b-mb-results-classical-rules.ipynb index 337dd6d5..b2bc11b4 100644 --- a/notebooks/6.0b-mb-results-classical-rules.ipynb +++ b/notebooks/6.0b-mb-results-classical-rules.ipynb @@ -13,7 +13,6 @@ "from pathlib import Path\n", "\n", "import pandas as pd\n", - "from sklearn.metrics import accuracy_score\n", "import wandb\n", "from tqdm.auto import tqdm\n", "\n", @@ -158,7 +157,7 @@ " (\"depth\", \"best\"),\n", " (\"depth\", \"ex\"),\n", " (\"rev_tick\", \"all\"),\n", - " ], # p. 
13 grauer (benchmark 2) \n", + " ], # grauer (benchmark 2) \n", "]\n", "\n", "# generate names for array\n", @@ -182,7 +181,6 @@ " clf = ClassicalClassifier(layers=rule, random_state=seed, strategy=\"none\")\n", " # fit is only used to set sklearn attributes, no leakage\n", " clf.fit(X=X_test.head(5), y=y_test.head(5))\n", - " # print(f\"{rule}: {clf.score(X_test, y_test)}\")\n", " result = clf.predict(X_test).astype(int)\n", " results.append(result)\n" ] diff --git a/notebooks/6.0c-mb-results-universal.ipynb b/notebooks/6.0c-mb-results-universal.ipynb index 37943230..5c4b22c3 100644 --- a/notebooks/6.0c-mb-results-universal.ipynb +++ b/notebooks/6.0c-mb-results-universal.ipynb @@ -426,6 +426,7 @@ " c_1_0 = np.where((Y_[model_1] != Y_[ground_truth]) & (Y_[model_2] == Y_[ground_truth]), 1, 0).sum()\n", " c_1_1 = np.where((Y_[model_1] != Y_[ground_truth]) & (Y_[model_2] != Y_[ground_truth]), 1, 0).sum()\n", " \n", + " # [both right, gbm right/transformer wrong, gbm wrong/transformer right, both wrong]\n", " contingency_table = [[c_0_0, c_0_1],[c_1_0, c_1_1]]\n", "\n", " return np.array(contingency_table)\n", diff --git a/notebooks/6.0d-mb-results-gradient-boosting.ipynb b/notebooks/6.0d-mb-results-gradient-boosting.ipynb index d18fff3a..31fcec96 100644 --- a/notebooks/6.0d-mb-results-gradient-boosting.ipynb +++ b/notebooks/6.0d-mb-results-gradient-boosting.ipynb @@ -135,7 +135,7 @@ "id": "zMIOV1jA_ImH" }, "source": [ - "## CatBoost Baseline 🐈‍⬛" + "## CatBoost🐈‍⬛" ] }, { diff --git a/notebooks/6.0e-mb-viz-universal.ipynb b/notebooks/6.0e-mb-viz-universal.ipynb index 65a8e419..9a5f9358 100644 --- a/notebooks/6.0e-mb-viz-universal.ipynb +++ b/notebooks/6.0e-mb-viz-universal.ipynb @@ -9,25 +9,33 @@ }, "outputs": [], "source": [ + "import json\n", + "import os\n", + "import pickle\n", + "from pathlib import Path\n", + "\n", "import numpy as np\n", + "import pandas as pd\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "from matplotlib import rc\n", - "import torch\n", - "import pandas as pd\n", + "\n", "import matplotlib.dates as mdates\n", "from matplotlib.dates import DateFormatter\n", "import matplotlib.ticker as ticker\n", - "from matplotlib.ticker import FormatStrFormatter, StrMethodFormatter, PercentFormatter,MaxNLocator\n", + "from matplotlib.ticker import StrMethodFormatter, PercentFormatter,MaxNLocator\n", "\n", - "import json\n", - "import os\n", - "import pickle\n", - "from pathlib import Path\n", - "import optuna\n", - "import wandb\n", + "from sklearn import datasets\n", + "from sklearn.svm import SVC\n", + "from sklearn.semi_supervised import SelfTrainingClassifier\n", + "from sklearn.tree import DecisionTreeRegressor\n", "\n", - "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"" + "import torch\n", + "from torch import optim\n", + "from torch import nn\n", + "\n", + "import optuna\n", + "import wandb" ] }, { @@ -39,6 +47,8 @@ }, "outputs": [], "source": [ + "os.environ[\"GCLOUD_PROJECT\"] = \"flowing-mantis-239216\"\n", + "\n", "params = {\n", " \"pgf.texsystem\": \"xelatex\",\n", " \"pgf.rcfonts\": False,\n", @@ -54,9 +64,8 @@ "plt.rc('text.latex', preamble=r'\\usepackage{amsmath}\\usepackage[utf8]{inputenc}')\n", "\n", "CM = 1 / 2.54\n", - "# cmap = plt.cm.get_cmap(\"viridis\")\n", "cmap = mpl.colormaps.get_cmap(\"plasma\")\n", - "# plt.style.use(['science','nature'])\n", + "\n", "\n", "# Bright color scheme\n", "# color-blind safe\n", @@ -83,7 +92,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -165,7 
+173,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -270,52 +277,6 @@ ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Distribution of Log-Loss in Gradient Boosting" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "losses = pd.read_parquet(\"gs://thesis-bucket-option-trade-classification/data/results/ise_gbm_supervised_test_viz_dist_loss-viz-dist-loss.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(figsize=(14*CM,6*CM))\n", - "\n", - "bins = np.linspace(0, 6, 100)\n", - "\n", - "ax.hist(losses.iloc[:,0],bins=bins, histtype=\"step\", label=\"Iteration 5\");\n", - "ax.hist(losses.iloc[:,1],bins=bins, histtype=\"step\", label=\"Iteration 100\");\n", - "ax.hist(losses.iloc[:,2],bins=bins, histtype=\"step\", label=\"Iteration 1{,}000\");\n", - "ax.hist(losses.iloc[:,3],bins=bins, histtype=\"step\", label=\"Iteration 2{,}000\");\n", - "\n", - "fig.legend(frameon=False, loc=\"lower center\", ncols=4, bbox_to_anchor=(0.5, -0.07))\n", - "\n", - "ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", - "ax.set_ylabel(\"Count\")\n", - "ax.set_xlabel(\"Log Loss Per Sample (Val)\")\n", - "\n", - "ax.annotate('Outliers', xy=(5, 0), xytext=(5, 200000), arrowprops=dict(arrowstyle=\"->\", connectionstyle=\"arc3\"))\n", - "\n", - "plt.tight_layout()\n", - "\n", - "plt.savefig(\"../reports/Graphs/gbm-loss-distribution.pdf\", bbox_inches=\"tight\")" - ] - }, - { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -421,7 +382,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -483,6 +443,8 @@ "# ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0f}'))\n", "axes[0].xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", "\n", + "ylim = axes[0].get_ylim()\n", + "\n", "axes[0].set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", "\n", "plt.legend([\"_\",\"$\\operatorname{tick}_{\\mathrm{all}}$\", \"$\\operatorname{quote}_{\\mathrm{nbbo}}$\",\"$\\operatorname{gsu}_{\\mathrm{small}}$\",\"$\\operatorname{gsu}_{\\mathrm{large}}$\"],frameon=False, loc = \"lower center\", bbox_to_anchor=(0.5, -0.5), ncols=4)\n", @@ -523,11 +485,15 @@ "# axes[0].set_title(\"ISE\")\n", "# axes[1].set_title(\"CBOE\")\n", "\n", + "print(ylim)\n", + "\n", + "axes.invert_yaxis()\n", + "\n", "axes.set_xlim(pd.to_datetime(\"2005-05-02\"), pd.to_datetime(\"2017-10-31\"))\n", - "axes.set_ylim([0,20.0])\n", + "axes.set_ylim([100-lim*100 for lim in ylim])\n", "axes.yaxis.set_major_formatter(PercentFormatter(100.0,decimals=2))\n", "axes.xaxis.set_major_formatter(DateFormatter('%b %Y'))\n", - "axes.set_ylabel(\"Fraction\")\n", + "axes.set_ylabel(\"Percentage\")\n", "\n", "labels = [\"_\",\"_\",\"At Mid (ISE)\", \"At Mid (CBOE)\"]\n", "\n", @@ -592,7 +558,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "vVE2JK9Af5gW" @@ -633,7 +598,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "h9mAHJU1f5gX" @@ -651,7 +615,6 @@ }, "outputs": [], "source": [ - "from sklearn.tree import DecisionTreeRegressor\n", "\n", "# Create a random dataset\n", "rng = np.random.RandomState(1)\n", @@ -685,7 +648,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "rdBVk3fyf5gZ" @@ -831,7 +793,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", 
"metadata": { "id": "SyA46Ie6f5gc" @@ -938,7 +899,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "KLKHwCjOf5gg" @@ -956,20 +916,6 @@ }, "outputs": [], "source": [ - "# Authors: Clay Woolam \n", - "# Oliver Rausch \n", - "# License: BSD\n", - "\n", - "# import numpy as np\n", - "# import matplotlib.pyplot as plt\n", - "from sklearn import datasets\n", - "from sklearn.svm import SVC\n", - "from sklearn.semi_supervised import LabelSpreading\n", - "from sklearn.semi_supervised import SelfTrainingClassifier\n", - "\n", - "from matplotlib.ticker import MaxNLocator # needed for integer only on axis\n", - "from matplotlib.lines import Line2D # for creating the custom legend\n", - "\n", "iris = datasets.load_iris()\n", "\n", "X = iris.data[:, :2]\n", @@ -1040,7 +986,7 @@ " \n", " z_proba = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])\n", " # the size of each probability dot\n", - " Z_size = np.max(Z_proba, axis=1) \n", + " Z_size = np.max(z_proba, axis=1) \n", " \n", " Z = Z.reshape(xx.shape)\n", " \n", @@ -1062,7 +1008,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "jGL-HbYlf5gi" @@ -1083,7 +1028,7 @@ "# set study globally here\n", "# study = \"1gzk7msy.optuna:v49\" # gbm classical\n", "# study = \"3vntumoi.optuna:v49\" # gbm classical-size\n", - "# study = \"2t5zo50f.optuna:v49\" # gbm ml\n", + "study = \"2t5zo50f.optuna:v49\" # gbm ml\n", "\n", "# study = \"37lymmzc.optuna:v49\" # gbm semi-classical\n", "# study = \"1vmti6db.optuna:v49\" # gbm semi classical-size\n", @@ -1092,7 +1037,7 @@ "# transformer \n", "# study = \"3jpe46s1.optuna:v9\" # transformer classical\n", "# study = \"1qx3ul4j.optuna:v9\" # transformer classical-size\n", - "study = \"2h81aiow.optuna:v9\" # transformer ml" + "# study = \"2h81aiow.optuna:v9\" # transformer ml" ] }, { @@ -1307,6 +1252,12 @@ " cs_list.append(cs)\n", " if cs_list:\n", " axcb = fig.colorbar(cs_list[0], ax=axs, aspect=50)\n", + " # Create a formatter function for percentages\n", + " formatter = ticker.PercentFormatter(xmax=1.0, decimals=2)\n", + "\n", + " # Set the formatter for the colorbar\n", + " axcb.ax.yaxis.set_major_formatter(formatter)\n", + " \n", " axcb.set_label(\"Accuracy\")\n", "\n", " return axs\n", @@ -1619,7 +1570,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "roRmlg_nf5gl" @@ -1930,7 +1880,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "r5ZnoZIG26K_" @@ -1947,11 +1896,6 @@ }, "outputs": [], "source": [ - "import numpy as np\n", - "from torch import optim\n", - "from torch import nn\n", - "\n", - "\n", "class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):\n", " def __init__(self, optimizer, warmup, max_iters):\n", " self.warmup = warmup\n", @@ -2028,7 +1972,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "hc7pkVqe4qNw" diff --git a/notebooks/6.0f-mb-viz-gradient-boosting.ipynb b/notebooks/6.0f-mb-viz-gradient-boosting.ipynb index e20db5ef..6c7e5414 100644 --- a/notebooks/6.0f-mb-viz-gradient-boosting.ipynb +++ b/notebooks/6.0f-mb-viz-gradient-boosting.ipynb @@ -14,7 +14,6 @@ "import sys\n", "from pathlib import Path\n", "\n", - "import matplotlib.pyplot as plt\n", "\n", "import numpy as np\n", "import pandas as pd\n", diff --git a/notebooks/6.0g-mb-viz-fttransformer.ipynb b/notebooks/6.0g-mb-viz-fttransformer.ipynb index e44ae0c8..36a99e9a 100644 --- a/notebooks/6.0g-mb-viz-fttransformer.ipynb +++ b/notebooks/6.0g-mb-viz-fttransformer.ipynb @@ -31,7 +31,6 @@ "from 
otc.data.dataset import TabDataset\n", "from otc.data.dataloader import TabDataLoader\n", "from otc.features.build_features import features_classical\n", - "from otc.optim.early_stopping import EarlyStopping\n", "from otc.optim.scheduler import CosineWarmupScheduler\n" ] }, diff --git a/notebooks/6.0h-mb-viz-embeddings.ipynb b/notebooks/6.0h-mb-viz-embeddings.ipynb index 780db6b0..f6028b3a 100644 --- a/notebooks/6.0h-mb-viz-embeddings.ipynb +++ b/notebooks/6.0h-mb-viz-embeddings.ipynb @@ -15,10 +15,9 @@ "import json\n", "import os\n", "import pickle\n", - "import sys\n", "from pathlib import Path\n", "\n", - "# from adjustText import adjust_text\n", + "from adjustText import adjust_text\n", "\n", "import numpy as np\n", "import pandas as pd\n", @@ -305,7 +304,6 @@ " dot_product = matrix_of_vectors @ matrix_of_vectors.t()\n", " norms = torch.sqrt(torch.einsum(\"ii->i\", dot_product))\n", " similarities = dot_product / (norms[None] * norms[..., None])\n", - " # similarities = dot_product / (norms[:, None] * norms[None, :])\n", " return similarities\n" ] }, @@ -457,9 +455,9 @@ ], "metadata": { "kernelspec": { - "display_name": "myenv", + "display_name": "Python 3", "language": "python", - "name": "myenv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -471,7 +469,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.4" } }, "nbformat": 4, diff --git a/notebooks/9.0e-mb-discussion.ipynb b/notebooks/6.0i-mb-discussion.ipynb similarity index 80% rename from notebooks/9.0e-mb-discussion.ipynb rename to notebooks/6.0i-mb-discussion.ipynb index a015ad04..8b650480 100644 --- a/notebooks/9.0e-mb-discussion.ipynb +++ b/notebooks/6.0i-mb-discussion.ipynb @@ -10,16 +10,13 @@ "outputs": [], "source": [ "import os\n", - "import random\n", "import sys\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", - "from sklearn.metrics import accuracy_score\n", "\n", "sys.path.append(\"..\")\n", - "import warnings\n", "\n", "import wandb\n", "from tqdm.auto import tqdm\n" @@ -388,89 +385,6 @@ " margins=True)\n", "pivot_table.div(pivot_table.iloc[-1], axis=1)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# mid p. 
31 + extra category for unknowns\n", - "ask = X_print[\"ask_ex\"]\n", - "bid = X_print[\"bid_ex\"]\n", - "trade_price = X_print[\"TRADE_PRICE\"]\n", - "\n", - "# require ask >= bid\n", - "mid = np.where(ask >= bid, (ask + bid) * 0.5, np.nan)\n", - "\n", - "results = []\n", - "\n", - "# calculate true rel effective spread but not aggregated, convert to %\n", - "es_true = effective_spread(X_print[\"buy_sell\"], X_print[\"TRADE_PRICE\"], mid, mode=\"none\")\n", - "nom_true = np.nanmean(es_true)\n", - "\n", - "eps_true = np.empty(es_true.shape)\n", - "np.divide(es_true, mid, out=eps_true, where=mid != 0)\n", - "rel_true = np.nanmean(eps_true)\n", - "\n", - "\n", - "for classifier in tqdm(classifiers):\n", - "\n", - " # calculate pred rel effective spread but not aggregated convert to %\n", - " es_pred = effective_spread(X_print[classifier], X_print[\"TRADE_PRICE\"], mid, mode=\"none\")\n", - " \n", - " eps_pred = np.empty(es_pred.shape)\n", - " np.divide(es_pred, mid, out=eps_pred, where=mid != 0)\n", - "\n", - " wilcoxon_res = wilcoxon(eps_pred, eps_true, nan_policy=\"omit\", zero_method=\"zsplit\")\n", - "\n", - " res = pd.Series(\n", - " {\n", - " \"nom_pred\": np.nanmean(es_pred),\n", - " \"rel_pred\": np.nanmean(eps_pred),\n", - " \"statistic\":wilcoxon_res.statistic,\n", - " \"pvalue\":wilcoxon_res.pvalue,\n", - " }, name=classifier\n", - " )\n", - " results.append(res)\n", - "\n", - "true_eff = pd.Series({\"nom_pred\":nom_true, \"rel_pred\": rel_true, \"statistic\":np.NaN, \"pvalue\":np.NaN}, name=\"true_eff\")\n", - "\n", - "results.append(true_eff)\n", - "\n", - "results = pd.concat(results, axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "results.T.style.format(\"{:.3f}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "results.T.style.to_latex(\n", - " f\"../reports/Content/{KEY}-eff-spread.tex\",\n", - " siunitx=True,\n", - " position_float=\"centering\",\n", - " hrules=True,\n", - " clines=\"skip-last;data\",\n", - " label=f\"tab:eff-{KEY}\",\n", - " caption=(f\"long-eff-{KEY}\", f\"short-eff-{KEY}\"),\n", - " convert_css=True,\n", - ")\n" - ] } ], "metadata": { diff --git "a/references/obsidian/\360\237\223\221notes/\360\237\221\266introduction notes.md" "b/references/obsidian/\360\237\223\221notes/\360\237\221\266introduction notes.md" index d9bee9e1..8aa09c18 100644 --- "a/references/obsidian/\360\237\223\221notes/\360\237\221\266introduction notes.md" +++ "b/references/obsidian/\360\237\223\221notes/\360\237\221\266introduction notes.md" @@ -31,4 +31,6 @@ Our work makes the following contributions: 2. In a real-world setting, labelled trades, or trades for which the true initiator is known, are sparse, but unlabelled trades are abundant. Motivated by this consideration, we extend our classifiers to learn on both labelled and unlabelled data through pre-training and self-training procedures and study the impact on classification accuracy. Specifically, we can show, that pre-training of Transformers improves accuracy on gls-ise data. 3. We strive to understand which features are most predictive. Through a game-theoretic approach, our work is the first to consistently attribute the performance of rule-based trade classification and machine learning-based predictors to individual features. 
We find that, both paradigms share a common set of features, but machine learning-based classifiers attain higher performance gains and thus better exploit the data. By probing and visualising the attention mechanism inside the Transformer, we can further strengthen the link to rule-based classification and uncover that the *learned* rules mimic *classical* rules. +We employ state-of-the-art supervised algorithms, i.~e., gradient-boosted trees and Transformer networks, for trade classification and benchmark these approaches against rule-based methods. Our approaches outperform all rule-based approaches on \gls{ISE} and \gls{CBOE} data with comparable data requirements, and performance is robust across subsets. Our smallest and medium-sized Transformers outperform the best previously reported rules from ([[@grauerOptionTradeClassification2022]]13--15) by percentage-3.73 to percentage-4.97 on gls-ISE and percentage-5.44 to percentage-5.64 on gls-CBOE. With option features, accuracies reach up to percentage-74.28 percentage-(+7.76). For gradient boosting, improvements relative to the benchmark range between percentage-3.62 and percentage-6.51 on gls-ISE, and between percentage-5.26 and percentage-7.86 on gls-CBOE, depending on the features in use. +We apply our classifiers to the problem of effective spread estimation. On gls-CBOE data, our models approximate the true effective spread of percentage-2.5 most closely, versus an estimated spread of percentage-5.7. On gls-ISE they are among the best-performing solutions. \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\221notes/\360\237\233\214 Token embeddings notes.md" "b/references/obsidian/\360\237\223\221notes/\360\237\233\214Token embeddings notes.md" similarity index 100% rename from "references/obsidian/\360\237\223\221notes/\360\237\233\214 Token embeddings notes.md" rename to "references/obsidian/\360\237\223\221notes/\360\237\233\214Token embeddings notes.md" diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\217\205Results of semi-supervised.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\217\205Results of semi-supervised.md" index 633866b3..8f9ea04e 100644 --- "a/references/obsidian/\360\237\223\226chapters/\360\237\217\205Results of semi-supervised.md" +++ "b/references/obsidian/\360\237\223\226chapters/\360\237\217\205Results of semi-supervised.md" @@ -1,3 +1,29 @@ +The intuition behind BERT is that the early layers learn generic linguistic patterns that have little relevance to the downstream task, while the later layers learn task-specific patterns. This intuition is in line with deep computer vision models, where the early layers learn generic features such as edges and corners, and the later layers learn specific features, such as eyes and noses in the case of facial detection. + +This intuition has been experimentally confirmed by another Google team, Amil Merchant et al., in their work “[What Happens To BERT Embeddings During Fine-tuning?](https://arxiv.org/pdf/2004.14448.pdf)” One of their techniques is called partial freezing: they keep the early BERT layers frozen, i.e. fixed, during the fine-tuning process, and measure how much the performance on the downstream task changes when varying the number of frozen layers. They show that the performance on both MNLI and SQuAD tasks does not notably drop even when freezing the first 8 of the 12 BERT layers (i.e. tuning only the last 4). 
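+A minimal sketch of partial freezing, assuming the Hugging Face `transformers` implementation of a 12-layer BERT-base; the checkpoint name, the 8/4 layer split, and the binary head are illustrative choices, not taken from this repo:

```python
import torch
from transformers import AutoModelForSequenceClassification

# Illustrative setup: 12-layer BERT-base with a binary classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

# Partial freezing: keep the embeddings and the first 8 encoder layers fixed,
# so only layers 8-11 and the classification head receive gradient updates.
for module in [model.bert.embeddings, *model.bert.encoder.layer[:8]]:
    for param in module.parameters():
        param.requires_grad = False

# Pass only the trainable parameters to the optimizer.
optimizer = torch.optim.AdamW(
    (p for p in model.parameters() if p.requires_grad), lr=1e-5
)
```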
+ +This finding corroborates the intuition that the last layers are the most task-specific, and therefore change the most during the fine-tuning process, while the early layers remain relatively stable. The results also imply that practitioners can potentially save compute resources by freezing the early layers instead of training the entire network during fine-tuning. + +“Impact of target domain Pretrained language model representations are intended to be universal. However, the target domain might still impact the adaptation performance. We calculate the Jensen-Shannon divergence based on term distributions (Ruder and Plank, 2017) between the domains used to train BERT (books and Wikipedia) and each MNLI domain. We show results in Table 6. We find no significant correlation. At least for this task, the distance of the source and target domains does not seem to have a major impact on the adaptation performance.” ([[@petersTuneNotTune2019]], p. 4) + +“Past work in NLP (Mou et al., 2016) showed that similar pretraining tasks transfer better.1 In computer vision (CV), Yosinski et al. (2014) similarly found that the transferability of 1Mou et al. (2016), however, only investigate transfer between classification tasks (NLI → SICK-E/MRPC). features decreases as the distance between the pretraining and target task increases. In this vein, Skip-thoughts—and Quick-thoughts (Logeswaran and Lee, 2018), which has similar performancewhich use a next-sentence prediction objective similar to BERT, perform particularly well on STS tasks, indicating a close alignment between the pretraining and target task. This strong alignment also seems to be the reason for BERT’s strong relative performance on these tasks.” ([[@petersTuneNotTune2019]], p. 3) + +“We find that while fine-tuning necessarily makes significant changes, it does not lead to catastrophic forgetting of linguistic phenomena. We instead find that fine-tuning primarily affects the top layers of BERT, but with noteworthy variation across tasks.” ([[@merchantWhatHappensBERT2020]] p. 1) + +“fine-tuned Transformers achieve state-of-the-art performance but also can end up learning shallow shortcuts, heuristics, and biases (McCoy et al., 2019b,a; Gururangan et al., 2018; Poliak et al., 2018).” ([[@merchantWhatHappensBERT2020]], p. 1) + +For Transformer-based models (Vaswani et al., 2017), analyses of attention weights have shown interpretable patterns in their structure (Coenen et al., 2019; Vig and Belinkov, 2019; Voita et al., 2019b; Hoover et al., 2019) and found strong correlations to syntax (Clark et al., 2019). However, other studies have also cast doubt on what conclusions can be drawn from attention patterns (Jain and Wallace, 2019; Serrano and Smith, 2019; Brunner et al., 2019). + +“Next, our results using RSA and layer ablations show that the changes from fine-tuning alter a fraction of the model capacity, specifically within the top few layers (up to some variation across tasks). Also, although fine-tuning has a significant affect on the representations of in-domain sentences, the representations of out-of-domain examples remain much closer to those of the pre-trained model.” (Merchant et al., 2020, p. 9) + +“Generalization is a crucial component of learning a language. No training set can contain all possible sentences, so learners must be able to generalize to sentences that they have never encountered before. We differentiate two types of generalization: 1.
In-distribution generalization: Generalization to examples which are novel but which are drawn from the same distribution as the training set. 2. Out-of-distribution generalization: Generalization to examples drawn from a different distribution than the training set.” (McCoy et al., 2020, p. 1) + +“owever, this strong performance does not necessarily indicate mastery of language. Because of biases in training distributions, it is often possible for a model to achieve strong in-distribution generalization by using shallow heuristics rather than deeper linguistic knowledge.” (McCoy et al., 2020, p. 1) + +“e found that these 100 instances were remarkably consistent in their in-distribution generalization accuracy, with all accuracies on the MNLI development set falling in the range 83.6% to 84.8%, and with a high level of consistency on labels for specific examples (e.g., we identified 526 examples that all 100 instances labeled incorrectly). In contrast, these 100 instances varied dramatically in their out-of-distribution generalization performance; for example, on one of the thirty categories of examples in the HANS dataset, accuracy ranged from 4% to 76%.” ([[@mccoyBERTsFeatherNot2020]], p. 2) + + + We compare the performance of pre-trained Transformers and self-trained gradient-boosting on the gls-ise and gls-cboe test set. Results are reported in cref-tab-semi-supervised-results. ![[Pasted image 20230701154037.png]] (supervised) diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md" index e725e590..3294b41a 100644 --- "a/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md" +++ "b/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md" @@ -53,4 +53,4 @@ Embeddings can only encode the semantic relationship of tokens, but they do not [^2:] Throughout this work, we adhere to a notation suggested in [[@phuongFormalAlgorithmsTransformers2022]] (p. 1 f) to maintain consistency. **Notes:** -[[🛌 Token embeddings notes]] \ No newline at end of file +[[🛌Token embeddings notes]] \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@caoInformationalContentOption2005.md" "b/references/obsidian/\360\237\223\245Inbox/@caoInformationalContentOption2005.md" index deee3a97..7ca402c4 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@caoInformationalContentOption2005.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@caoInformationalContentOption2005.md" @@ -1,7 +1,7 @@ *title:* Informational Content of Option Volume Prior to Takeovers *authors:* Charles Cao, Zhiwu Chen, John M. Griffin *year:* 2005 -*tags:* +*tags:* #application #trade-classification *status:* #📥 *related:* *code:* @@ -10,4 +10,5 @@ ## Notes 📍 ## Annotations 📖 -Note: \ No newline at end of file + +“To appreciate the informational content of option and stock volume, we examine buyer- and seller-initiated volume. The BODB, ISSM, and TAQ have no information on whether a trade is buyer or seller initiated, one must use intraday trade and quote data to classify trades. We adopt an algorithm similar to the ones used by Lee and Ready (1991) for stock trades and by Vijh (1990), Amin and Lee (1997), and Easley et al. (1998) for option trades. Specifically, we assign a trade as a buy (sell) if it occurs above (below) the bid-ask midpoint. 
For trades executed at the bid-ask midpoints, we classify the trade as a buy (sell) if its trade price is higher (lower) than its preceding price. All other trades are classified as cross-trades and excluded.” (Cao et al., 2005, p. 1079) \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@finucaneDirectTestMethods2000.md" "b/references/obsidian/\360\237\223\245Inbox/@finucaneDirectTestMethods2000.md" index c256f22b..1c547bbb 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@finucaneDirectTestMethods2000.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@finucaneDirectTestMethods2000.md" @@ -60,4 +60,6 @@ related: “Table 5 contains the maximum likelihood coefficient estimates and asso? ciated x2-statistics for the two models, together with estimates of the marginal change in the probability of correctly classifying an observation for a one unit change in each independent variable.” ([Finucane, 2000, p. 566](zotero://select/library/items/KKJY6E7W)) ([pdf](zotero://open-pdf/library/items/RQ8KUGBP?page=15&annotation=MH5DIL3R)) -“show that efforts to philtre data in an attempt to increase classification accuracy may further exacerbate these biases. Somewhat surprisingly, although the classification error rates are slightly smaller for LR's method than for the tick test, the biases for estimated effective spreads and signed volume are smaller for the tick test than for LR's method. These findings sug? gest that researchers using the tick test to classify trades will achieve results that are close to the results that can be achieved using quote-based methods and, in at least some applications, the tick test may provide more accurate measures than quote-based methods.” ([Finucane, 2000, p. 574](zotero://select/library/items/KKJY6E7W)) ([pdf](zotero://open-pdf/library/items/RQ8KUGBP?page=23&annotation=GZHPZHDJ)) \ No newline at end of file +“show that efforts to philtre data in an attempt to increase classification accuracy may further exacerbate these biases. Somewhat surprisingly, although the classification error rates are slightly smaller for LR's method than for the tick test, the biases for estimated effective spreads and signed volume are smaller for the tick test than for LR's method. These findings sug? gest that researchers using the tick test to classify trades will achieve results that are close to the results that can be achieved using quote-based methods and, in at least some applications, the tick test may provide more accurate measures than quote-based methods.” ([Finucane, 2000, p. 574](zotero://select/library/items/KKJY6E7W)) ([pdf](zotero://open-pdf/library/items/RQ8KUGBP?page=23&annotation=GZHPZHDJ)) + +“The significantly positive coefficient on the time between quotes (X13) for both algorithms implies that classification accuracy is reduced when quote changes occur frequently, likely because the problem of asynchronous prices is more severe when the time between quotes is short. Increasing the time between trades (X12) has the opposite effect; shorter times between trades result in more accurate classifications for both algorithms. The negative coefficient on the time between trades supports the hypothesis that shorter times between trades are associated with minimum variation markets, as well as the hypothesis that market conditions will change and adversely affect classification accuracy for the tick test as the time between trades increases.” (Finucane, 2000, p. 
568) \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@garleanuDemandBasedOptionPricing2009.md" "b/references/obsidian/\360\237\223\245Inbox/@garleanuDemandBasedOptionPricing2009.md" index c16a047f..eca202b5 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@garleanuDemandBasedOptionPricing2009.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@garleanuDemandBasedOptionPricing2009.md" @@ -17,6 +17,14 @@ related: ### Data Set +“We use a unique dataset to identify aggregate daily positions of dealers and end-users. In particular, we define dealers as market-makers and end-users as proprietary traders and customers of brokers.2 We are the first to document that end-users have a net long position in S&P 500 index options with large net positions in out-of-the-money (OTM) puts.” (Gârleanu et al., 2009, p. 4261) + “We acquire the data from two different sources. Data for computing net option demand were obtained directly from the Chicago Board Options Exchange (CBOE). These data consist of a daily record of closing short and long open interest on all SPX and equity options for public customers and firm proprietary traders.” ([Gârleanu et al., 2009, p. 16](zotero://select/library/items/U4LJ77Z9)) ([pdf](zotero://open-pdf/library/items/LKI84IS2?page=17&annotation=GE898JI6)) -“The other main source of data for this paper is the Ivy DB data set from OptionMetrics LLC. The OptionMetrics data include end-of-day volatilities implied from option prices, and we use the volatilities implied from SPX and CBOE listed equity options from the beginning of 1996 through the end of 2001. SPX options have European style exercise, and OptionMetrics computes implied volatilities by inverting the Black-Scholes formula.” ([Gârleanu et al., 2009, p. 17](zotero://select/library/items/U4LJ77Z9)) ([pdf](zotero://open-pdf/library/items/LKI84IS2?page=18&annotation=ZGYHPF5G)) \ No newline at end of file +“The other main source of data for this paper is the Ivy DB data set from OptionMetrics LLC. The OptionMetrics data include end-of-day volatilities implied from option prices, and we use the volatilities implied from SPX and CBOE listed equity options from the beginning of 1996 through the end of 2001. SPX options have European style exercise, and OptionMetrics computes implied volatilities by inverting the Black-Scholes formula.” ([Gârleanu et al., 2009, p. 17](zotero://select/library/items/U4LJ77Z9)) ([pdf](zotero://open-pdf/library/items/LKI84IS2?page=18&annotation=ZGYHPF5G)) + +## Index vs. equity options + +“The end-user demand for index options can help to explain the two puzzles that index options appear to be expensive, and that low-moneyness options seem to be especially expensive (Rubinstein 1994; Longstaff 1995; Bates 2000; Jackwerth 2000; Coval and Shumway 2001; Bondarenko 2003; Amin, Coval, and Seyhun 2004; Driessen and Maenhout 2008). In the time series, the model-based impact of demand for index options is positively related to their expensiveness, measured by the difference between their implied volatility and the volatility measure of Bates (2006). Indeed, we estimate that on the order of one-third of index-option expensiveness can be accounted for by demand effects.5” (Gârleanu et al., 2009, p. 4261) + +“Another option-pricing puzzle is the significant difference between indexoption prices and the prices of single-stock options, despite the relative similarity of the underlying distributions (e.g., Bakshi, Kapadia, and Madan 2003; Bollen and Whaley 2004). 
In particular, single-stock options appear cheaper and their smile is flatter. Consistently, we find that the demand pattern for single-stock options is very different from that of index options. For instance, end-users are net short single-stock options—not long, as in the case of index options. Demand patterns further help to explain the cross-sectional pricing of single-stock options. Indeed, individual stock options are relatively cheaper for stocks with more negative demand for options.” (Gârleanu et al., 2009, p. 4262) \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@grauerOptionTradeClassification2022.md" "b/references/obsidian/\360\237\223\245Inbox/@grauerOptionTradeClassification2022.md" index 842a6eb6..8500444c 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@grauerOptionTradeClassification2022.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@grauerOptionTradeClassification2022.md" @@ -59,6 +59,7 @@ related: - **Depth rule:** As a consequence, we classify midspread trades as buyer-initiated, if the ask size exceeds the bid size, and as seller-initiated, if the bid size is higher than the ask size. - Applying our proposed “depth rule” after using the trade size rule and quote rules improves the performance by around 0.8%. - “We show the overall success rates of the classification algorithms using our trade size rule and also calculate the change in the success rates compared to the same algorithms not using the trade size rule in parentheses. The results show that our new rule works best for small to medium-sized trades and even leads to a slight deterioration of the performance for the largest trade sizes.” (Grauer et al., 2022, p. 15) +- “Based on our findings so far, we recommend that researchers use our new trade size rule together with quote rules successively applied to NBBO and quotes on the trading venue. Quotes at the midpoint on both the NBBO and the exchange should be classified first with the depth rule and any remaining trades with the reverse tick test. Most importantly, the LR algorithm alone, which is heavily used in the literature (see, e.g., Pan and Poteshman (2006); Hu (2014); Easley, O’Hara, and Srinivas (1998)), does a poor job to identify buy and sell orders in option trade data.8 Overall, the accuracy of all common classification algorithms to infer option trade direction can be significantly improved by our two new rules” (Grauer et al., 2023, p. 15) ## Out-of-sample-tests - “Namely, tick tests perform best when using most current price information across all exchanges and reverse tick tests based on subsequent prices dominate their counterparts based on preceding ones.” (Grauer et al., 2022, p. 16) diff --git "a/references/obsidian/\360\237\223\245Inbox/@leeInferringInvestorBehavior2000.md" "b/references/obsidian/\360\237\223\245Inbox/@leeInferringInvestorBehavior2000.md" index 3b6b93af..b04d449a 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@leeInferringInvestorBehavior2000.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@leeInferringInvestorBehavior2000.md" @@ -1,10 +1,12 @@ *title:* Inferring investor behaviour: Evidence from TORQ data *authors:* Charles M.C. 
Lee, Balkrishna Radhakrishna *year:* 1999 -*tags:* +*tags:* #trade-classification #trade-initiator *status:* #📥 *related:* # Notes -# Annotations \ No newline at end of file +# Annotations + +“For this purpose, we define the following as clearly active or passive” (Lee and Radhakrishna, 2000, p. 97): clearly active = non-stopped market orders, executable limit orders, ITS commitments; clearly passive = non-executable limit orders, stopped market-orders, ITS executions. \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@leeInferringTradeDirection1991.md" "b/references/obsidian/\360\237\223\245Inbox/@leeInferringTradeDirection1991.md" index 40cf5cb8..bb0d31ec 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@leeInferringTradeDirection1991.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@leeInferringTradeDirection1991.md" @@ -12,7 +12,6 @@ related: **code:** - https://github.com/jblocher/sas_util/blob/master/LR_Trade_ID.sas - ## Notes - Most cited paper in the Journal of Finance. - Authors propose the LR algorithm to classify individual trades as market buy and sell orders using intraday trade / execution prices and quote data. The LR algorithm fuses two commonly used algorithms: use the quote rule in general (due to its performance), but apply the tick rule to trades at the midpoint of the spread (see the code sketch below). @@ -25,7 +24,6 @@ related: - The tick test is relatively imprecise when compared with the quote rule. This is especially evident if the prevailing quote has changed or if the quote is a long time back. - An alternative is the reverse tick test ([[@hasbrouckTradesQuotesInventories1988]]), which compares the trade price against the prices of trades immediately following the trade. If the following price is higher, the reverse tick test classifies the current trade as a sell. - If the trade is bracketed by a price reversal (the price change before the trade is the opposite of the price change after the trade), the reverse tick test and the tick test yield the same results. -![[tick-rule-reverse-tick-rule.png]] - Authors study a sample of NYSE stocks. Authors do not know the true labels. This limits their evaluation, e.g., what is the true direction of a trade inside the spread. For trades at the ask, 92.1 % are classified as buys using the tick test and 90.2 % at the bid are classified as sells. Thus there is a high degree of agreement between the tick test and the quote rule if the prevailing quote is unambiguous. Unambiguous means that the quote revision occurred more than 5 sec before the trade. Quote revisions are generally triggered by trades. - Trades and quotes can be out of their natural order depending on how they are entered into the system (Problem 1). Authors observe that quote revisions are clustered near the trade, with a substantial portion of quotes recorded ahead of the trade. Authors suggest an adjustment: if the current quote is less than 5 sec old, it was probably caused by the trade, and so the previous quote should be used for classification. They note that a different delay might be appropriate for other markets and that the 5 sec rule was derived from the AMEX and NYSE sample. - When a trade causes a quote revision, the new quote tends to straddle the trade that triggered it. If new quotes are, however, recorded ahead of time, the current quote could cause a larger number of trades to appear inside the spread.
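The rules collected in these notes (quote rule, tick test, reverse tick test, and their LR-style combination, which also matches the Cao et al. procedure of falling back to the preceding price for midpoint trades) are simple enough to prototype directly. Below is a minimal pandas sketch, not the implementation used in this project; the column names `trade_price`, `bid`, `ask`, `price_lag`, and `price_lead` are assumptions, buys are coded as 1, sells as -1, and 0 marks trades the respective rule leaves unclassified.

```python
import numpy as np
import pandas as pd


def quote_rule(df: pd.DataFrame) -> pd.Series:
    """Buy (1) above the quote midpoint, sell (-1) below it, unclassified (0) at the midpoint."""
    mid = (df["bid"] + df["ask"]) / 2
    return pd.Series(
        np.select([df["trade_price"] > mid, df["trade_price"] < mid], [1, -1], default=0),
        index=df.index,
    )


def tick_rule(df: pd.DataFrame) -> pd.Series:
    """Buy (sell) if the trade price is above (below) the preceding trade price."""
    return pd.Series(
        np.select(
            [df["trade_price"] > df["price_lag"], df["trade_price"] < df["price_lag"]],
            [1, -1],
            default=0,
        ),
        index=df.index,
    )


def reverse_tick_rule(df: pd.DataFrame) -> pd.Series:
    """Sell (buy) if the following trade price is higher (lower) than the trade price."""
    return pd.Series(
        np.select(
            [df["trade_price"] < df["price_lead"], df["trade_price"] > df["price_lead"]],
            [-1, 1],
            default=0,
        ),
        index=df.index,
    )


def lr_algorithm(df: pd.DataFrame) -> pd.Series:
    """LR-style hybrid: quote rule in general, tick rule for trades at the quote midpoint."""
    signs = quote_rule(df)
    at_mid = signs == 0
    signs[at_mid] = tick_rule(df)[at_mid]
    return signs
```

In the Cao et al. variant, midpoint trades that the tick rule also cannot sign (no preceding price change) would be treated as cross-trades and dropped, which here corresponds to the remaining zeros.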
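The recommendation of Grauer et al. quoted earlier stacks these building blocks with their depth rule: quote rule on the NBBO, then on the trading venue's own quotes, the depth rule for trades still at the midpoint, and the reverse tick test for whatever remains. The sketch below is a hedged reading of that cascade and reuses the helpers above; column names such as `bid_nbbo`, `ask_nbbo`, `bid_ex`, `ask_ex`, `bid_size_ex`, and `ask_size_ex` are assumptions, and the trade size rule they apply first is omitted because its exact definition is not reproduced in these notes.

```python
def depth_rule(df: pd.DataFrame) -> pd.Series:
    """Midspread trades: buyer-initiated (1) if the ask size exceeds the bid size, seller-initiated (-1) if the bid size is larger."""
    return pd.Series(
        np.select(
            [df["ask_size_ex"] > df["bid_size_ex"], df["bid_size_ex"] > df["ask_size_ex"]],
            [1, -1],
            default=0,
        ),
        index=df.index,
    )


def stacked_rules(df: pd.DataFrame) -> pd.Series:
    """Cascade: quote rule (NBBO) -> quote rule (exchange) -> depth rule -> reverse tick test."""
    signs = quote_rule(df.assign(bid=df["bid_nbbo"], ask=df["ask_nbbo"]))
    for fallback in (
        quote_rule(df.assign(bid=df["bid_ex"], ask=df["ask_ex"])),
        depth_rule(df),
        reverse_tick_rule(df),
    ):
        unclassified = signs == 0
        signs[unclassified] = fallback[unclassified]
    return signs
```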
@@ -105,4 +103,6 @@ related: “We present evidence that trading inside the spread is due largely to “standing orders” that cause the effective spread to be narrower than the quoted spread” ([Lee and Ready, 1991, p. 14](zotero://select/library/items/FW283V5Z)) ([pdf](zotero://open-pdf/library/items/SVM9XEPW?page=14&annotation=J2SNEGZM)) -“For trades closer to the bid or ask we show that the tick test continues to perform well, although a simple assignment of trades as buys (sells), if they are closer to the bid (ask), will also perform well.” ([Lee and Ready, 1991, p. 14](zotero://select/library/items/FW283V5Z)) ([pdf](zotero://open-pdf/library/items/SVM9XEPW?page=14&annotation=PCR8DYSJ)) \ No newline at end of file +“For trades closer to the bid or ask we show that the tick test continues to perform well, although a simple assignment of trades as buys (sells), if they are closer to the bid (ask), will also perform well.” ([Lee and Ready, 1991, p. 14](zotero://select/library/items/FW283V5Z)) ([pdf](zotero://open-pdf/library/items/SVM9XEPW?page=14&annotation=PCR8DYSJ)) + +“However, based on the data for 1/4 and 3/8 spreads, we expect that the best approach for these trades is to classify those that occur in the middle of the spread using the tick test and other trades inside the spread as buys (sells) if they are closer to the ask (bid).” (Lee and Ready, 1991, p. 13) \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@mccoyBERTsFeatherNot2020.md" "b/references/obsidian/\360\237\223\245Inbox/@mccoyBERTsFeatherNot2020.md" new file mode 100644 index 00000000..f9a2be55 --- /dev/null +++ "b/references/obsidian/\360\237\223\245Inbox/@mccoyBERTsFeatherNot2020.md" @@ -0,0 +1,13 @@ +*title:* BERTs of a feather do not generalize together: Large variability in generalization across models with similar test set performance +*authors:* R. Thomas McCoy, Junghyun Min, Tal Linzen +*year:* 2020 +*tags:* +*status:* #📥 +*related:* +*code:* +*review:* + +## Notes 📍 + +## Annotations 📖 +Note: \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@merchantWhatHappensBERT2020.md" "b/references/obsidian/\360\237\223\245Inbox/@merchantWhatHappensBERT2020.md" new file mode 100644 index 00000000..a95a96a1 --- /dev/null +++ "b/references/obsidian/\360\237\223\245Inbox/@merchantWhatHappensBERT2020.md" @@ -0,0 +1,13 @@ +*title:* What Happens To BERT Embeddings During Fine-tuning? +*authors:* Amil Merchant, Elahe Rahimtoroghi, Ellie Pavlick, Ian Tenney +*year:* 2020 +*tags:* +*status:* #📥 +*related:* +*code:* +*review:* + +## Notes 📍 + +## Annotations 📖 +Note: \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@muravyevOptionsTradingCosts2020.md" "b/references/obsidian/\360\237\223\245Inbox/@muravyevOptionsTradingCosts2020.md" index 5105d470..81cd2fbb 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@muravyevOptionsTradingCosts2020.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@muravyevOptionsTradingCosts2020.md" @@ -1,7 +1,7 @@ *title:* Options Trading Costs Are Lower than You Think *authors:* Dmitriy Muravyev, Neil D Pearson *year:* 2020 -*tags:* +*tags:* #application #trade-classification *status:* #📥 *related:* *code:* @@ -10,4 +10,6 @@ ## Notes 📍 ## Annotations 📖 -Note: \ No newline at end of file + +## Trade Direction +“That the mean of the trade direction (buy = 1, sell = −1) is −0.034 implies there are slightly more seller-initiated trades (51.7%) than buyer-initiated trades (48.3%).
The trade direction is determined by the quote rule, and if a trade price is at the NBBO midpoint then the quote rule is applied to the best quotes of the reporting exchange.” (Muravyev and Pearson, 2020, p. 4980) \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@muravyevOrderFlowExpected2016.md" "b/references/obsidian/\360\237\223\245Inbox/@muravyevOrderFlowExpected2016.md" index 26069060..43fae362 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@muravyevOrderFlowExpected2016.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@muravyevOrderFlowExpected2016.md" @@ -14,4 +14,8 @@ “OPRA does not report option trade direction, and thus I infer this by applying the quote rule to the NBBO. If the trade is at the midpoint of the NBBO, the quote rule is applied to the best bid offer (BBO) from the exchange at which the trade occurs. In the Internet Appendix, I argue that this algorithm has small estimation error.” ([Muravyev, 2016, p. 688](zotero://select/library/items/5Q2232XU)) ([pdf](zotero://open-pdf/library/items/RVHGHBH8?page=16&annotation=5XAHY95U)) -“The data only include transactions that were executed at ISE; however, throughout most of the sample period, ISE was the largest equity options exchange with a market share of about 30%. OptionMetrics is a common source of price information on equity options. For each option contract, it contains end-of-day best bid and ask prices as well as other information such as volume, open interest, implied volatility, and option Greeks. Returns and volume for the underlying stocks are also taken from OptionMetrics to avoid data loss from merging with CRSP.” ([Muravyev, 2016, p. 688](zotero://select/library/items/5Q2232XU)) ([pdf](zotero://open-pdf/library/items/RVHGHBH8?page=16&annotation=XU4RCY39)) \ No newline at end of file +“The data only include transactions that were executed at ISE; however, throughout most of the sample period, ISE was the largest equity options exchange with a market share of about 30%. OptionMetrics is a common source of price information on equity options. For each option contract, it contains end-of-day best bid and ask prices as well as other information such as volume, open interest, implied volatility, and option Greeks. Returns and volume for the underlying stocks are also taken from OptionMetrics to avoid data loss from merging with CRSP.” ([Muravyev, 2016, p. 688](zotero://select/library/items/5Q2232XU)) ([pdf](zotero://open-pdf/library/items/RVHGHBH8?page=16&annotation=XU4RCY39)) + +## Trade Classification + +“I choose particular methods for inferring trade direction I BS and expected price changes E(μt|Ft). Standard algorithms (such as the quote rule) correctly classify the sign of the vast majority of trades.” (Muravyev, 2016, p. 684) \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@panInformationOptionVolume2006.md" "b/references/obsidian/\360\237\223\245Inbox/@panInformationOptionVolume2006.md" index 602470a7..02a60a1b 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@panInformationOptionVolume2006.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@panInformationOptionVolume2006.md" @@ -1,7 +1,7 @@ *title:* The information in option volume for future stock prices *authors:* Jun Pan, Allen M. 
Poteshman *year:* 2006 -*tags:* +*tags:* #option #trade-classification #application *status:* #📥 *related:* *code:* @@ -10,4 +10,5 @@ ## Notes 📍 ## Annotations 📖 -Note: \ No newline at end of file + +“This classification of trade types provides two advantages over the data sets that have been used previously. First, we know with certainty the ‘‘sign’’ of the trading volume. By contrast, the existing literature on the informational content of option trading volume at best infers the sign, with some error, from quote and trade information using the Lee and Ready (1991) algorithm.14 Second, unlike the previous literature, we know whether the initiator of observed volume is opening a new option position or closing one that he or she already had outstanding.” (Pan and Poteshman, 2006, p. 882) diff --git "a/references/obsidian/\360\237\223\245Inbox/@petersTuneNotTune2019.md" "b/references/obsidian/\360\237\223\245Inbox/@petersTuneNotTune2019.md" new file mode 100644 index 00000000..4a4401fb --- /dev/null +++ "b/references/obsidian/\360\237\223\245Inbox/@petersTuneNotTune2019.md" @@ -0,0 +1,13 @@ +*title:* To Tune or Not to Tune? Adapting Pretrained Representations to Diverse Tasks +*authors:* Matthew E. Peters, Sebastian Ruder, Noah A. Smith +*year:* 2019 +*tags:* +*status:* #📥 +*related:* +*code:* +*review:* + +## Notes 📍 + +## Annotations 📖 +Note: \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@poppeSensitivityVPINChoice2016.md" "b/references/obsidian/\360\237\223\245Inbox/@poppeSensitivityVPINChoice2016.md" index 70bf2fda..cba05886 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@poppeSensitivityVPINChoice2016.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@poppeSensitivityVPINChoice2016.md" @@ -10,3 +10,7 @@ related: - [[@ellisAccuracyTradeClassification2000]] - [[@chakrabartyTradeClassificationAlgorithms2007]] + +## Annotations + +“Fig. 1. Classification algorithms. This chart illustrates the functioning of three different trade-by-trade classification algorithms: LR by Lee and Ready (1991), EMO by Ellis et al. (2000) and CLNV by Chakrabarty et al. (2007).” (Pöppe et al., 2016, p. 167) \ No newline at end of file diff --git "a/references/obsidian/\360\237\223\245Inbox/@savickasInferringDirectionOption2003.md" "b/references/obsidian/\360\237\223\245Inbox/@savickasInferringDirectionOption2003.md" index 2488e80e..9e5df1b7 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@savickasInferringDirectionOption2003.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@savickasInferringDirectionOption2003.md" @@ -48,6 +48,8 @@ related: - **Observation 6:** “Time to maturity also has an indirect effect on classification precision. Trades in options with longer maturity tend to be smaller, resulting in negative correlation between the effects of maturity and of trade size.” (Savickas and Wilson, 2003, p. 889) - **Observation 7:** “Specifically, one of the most noticeable regularities is that smaller trades are classified more precisely. This is because these trades are more likely to be executed at quotes and are less prone to reversed-quote trading (partially due to the fact that many small trades are executed on RAES)” (Savickas and Wilson, 2003, p. 889) **Similar:** [[@grauerOptionTradeClassification2022]] - **Observation 8:** “There seems to be a direct relation between using the quote information and successful option trade classification. 
This relation implies that the information conveyed by the past transaction prices is less relevant than information contained in quotes.” (Savickas and Wilson, 2003, p. 891) +- “Specifically, the only difference between the tick and the EMO rules is that the latter uses the quote rule for at-the-quote trades. Consequently, the EMO method outperforms the tick rule only for those trades. Similarly, the LR and EMO methods treat at-the-quote and at-midspread trades exactly the same, but the LR approach applies the quote rule to all other trades.” (Savickas and Wilson, 2003, p. 891) +- “There is an alternative approach to same-sample comparisons. Because considering non-classifiable trades as being misclassified introduces a bias, one can compare all rules by labeling non-classifiable trades for each rule as buys or sells with a 50% probability. This allows the entire sample to be used. Applying this method of whole-sample comparison yields results that are similar to those obtained with the common sample (see Table 1).” (Savickas and Wilson, 2003, p. 887) ### Multivariate Analysis - For the multivariate analysis, the authors use **logistic regression** to study which of the 8 covariates are most important in a multivariate setting. - “The most economically significant variables in all four regressions are the trade location relative to quotes (including the RAES dummy), the absolute value of the relative underlying price change, and the put/call dummy.” (Savickas and Wilson, 2003, p. 893) diff --git "a/references/obsidian/\360\237\223\245Inbox/@theissenTestAccuracyLee2000.md" "b/references/obsidian/\360\237\223\245Inbox/@theissenTestAccuracyLee2000.md" index d9a5f960..41d35d2e 100644 --- "a/references/obsidian/\360\237\223\245Inbox/@theissenTestAccuracyLee2000.md" +++ "b/references/obsidian/\360\237\223\245Inbox/@theissenTestAccuracyLee2000.md" @@ -37,4 +37,6 @@ “The performance of the tick test deteriorates dramatically when transactions occurring on a zero tick are considered. For these trades, the classification obtained when using the tick test appears to be unrelated to the true classification. The percentage of correct classifications is only 52.6% and is not significantly different from 50%” ([Theissen, 2000, p. 10](zotero://select/library/items/ESEIBAMC)) ([pdf](zotero://open-pdf/library/items/2XMIU8NA?page=11&annotation=Y5669YUH)) -“Overall, the results indicate that, at least for the German stock market, the accuracy of the Lee / Ready trade classification method is limited. The misclassification probability of 27.23% is higher than the corresponding percentages reported for the NYSE and NASDAQ” ([Theissen, 2000, p. 10](zotero://select/library/items/ESEIBAMC)) ([pdf](zotero://open-pdf/library/items/2XMIU8NA?page=11&annotation=34PK6HSV)) \ No newline at end of file +“Overall, the results indicate that, at least for the German stock market, the accuracy of the Lee / Ready trade classification method is limited. The misclassification probability of 27.23% is higher than the corresponding percentages reported for the NYSE and NASDAQ” ([Theissen, 2000, p. 10](zotero://select/library/items/ESEIBAMC)) ([pdf](zotero://open-pdf/library/items/2XMIU8NA?page=11&annotation=34PK6HSV)) + +“We now turn to the question of how the high percentage of misclassifications obtained when applying the Lee/Ready method influences the results of empirical studies based on that method.
Whether misclassification is a serious problem for empirical applications depends on both the specific application and on whether the bias introduced by the classification algorithm is systematic in nature or not. We address this issue by analyzing two applications of the trade classification algorithm. We first address the estimation of the effective bid –ask spread and its components, and then turn to the generation of the data used for estimation of structural models of the Easley et al. (1996b) type.” (Theissen, 2001, p. 157) \ No newline at end of file diff --git a/reports/Content/Appendix.tex b/reports/Content/Appendix.tex index de66d80e..60f74792 100644 --- a/reports/Content/Appendix.tex +++ b/reports/Content/Appendix.tex @@ -13,10 +13,10 @@ \section{Appendix} \begin{tabular}{@{}p{3cm}p{3cm}lp{4cm}p{4cm}l@{}} \toprule Research & Data & Sample Period & Method & Baseline & Improvement \\ \midrule - \autocite[][15]{rosenthalModelingTradeDirection2012} & \gls{NASDAQ} & & Logistic regression & \gls{EMO} rule, \gls{LR} rule,\newline and tick rule & max. \SI{2.2}{\percent} \\ \cmidrule{2-6} + \autocite[\checkmark][411]{rosenthalModelingTradeDirection2012} & \gls{NASDAQ} & & Logistic regression & \gls{EMO} rule, \gls{LR} rule,\newline and tick rule & max. \SI{2.2}{\percent} \\ \cmidrule{2-6} & \gls{NYSE} & 03/12/2004 -- 31/12/2004 & Logistic regression & \gls{EMO} rule, \gls{LR} rule,\newline and tick rule & max. \SI{1.1}{\percent} \\\cmidrule{1-6} - \autocite[][489--494]{blazejewskiLocalNonParametricModel2005} & Australian Stock\newline Exchange & 11/11/2002 -- 27/08/2003 & $k$ nearest neighbor, \newline logistic regression,\newline trade continuation,\newline majority vote & - & - \\ \cmidrule{1-6} - \autocite[][49--57]{ronenMachineLearningTrade2022} & \gls{TRACE} & 01/07/2002 -- 31/12/2019 & Logistic regression, decision tree,\newline neural network, and random forests & \gls{LR} rule and tick rule,\newline and \gls{BVC} algorithm & max. \SI{13.3}{\percent} \\ \cmidrule{2-6} + \autocite[\checkmark][489--493]{blazejewskiLocalNonParametricModel2005} & Australian Stock\newline Exchange & 11/11/2002 -- 27/08/2003 & $k$ nearest neighbor, \newline logistic regression,\newline trade continuation,\newline majority vote & - & - \\ \cmidrule{1-6} + \autocite[\checkmark][49--57]{ronenMachineLearningTrade2022} & \gls{TRACE} & 01/07/2002 -- 31/12/2019 & Logistic regression, decision tree,\newline neural network, and random forests & \gls{LR} rule and tick rule,\newline and \gls{BVC} algorithm & max. \SI{13.3}{\percent} \\ \cmidrule{2-6} & \gls{NASDAQ} & 09/12/2013 -- 13/12/2013 & Logistic regression, decision tree,\newline neural network, and random forests & \gls{LR} rule, tick rule,\newline and \gls{BVC} algorithm & max. \SI{3.3}{\percent} \\ \bottomrule \end{tabular} \end{table} @@ -109,7 +109,7 @@ \subsection{Autocorrelation of Features} \begin{figure}[ht] \centering \includegraphics{auto-corr-features.pdf} - \caption[Autocorrelation of Features]{Autocorrelation Features. 
Own work.} + \caption[Autocorrelation of Features]{Autocorrelation of features.} \label{fig:auto-correlation-features} \end{figure} diff --git a/reports/Content/bibliography.bib b/reports/Content/bibliography.bib index dc5bc25c..657f3a89 100644 --- a/reports/Content/bibliography.bib +++ b/reports/Content/bibliography.bib @@ -1,11 +1,10 @@ @article{aasExplainingIndividualPredictions2021, title = {Explaining Individual Predictions When Features Are Dependent: More Accurate Approximations to Shapley Values}, - author = {Aas, Kjersti and Jullum, Martin and Løland, Anders}, + author = {Aas, Kjersti and Jullum, Martin and L{\o}land, Anders}, year = {2021}, journal = {Artificial Intelligence}, volume = {298}, - doi = {10.1016/j.artint.2021.103502}, - urldate = {2023-03-26} + doi = {10.1016/j.artint.2021.103502} } @incollection{abeDeepLearningForecasting2018, @@ -17,18 +16,18 @@ @incollection{abeDeepLearningForecasting2018 volume = {10937}, publisher = {{Springer International Publishing}}, address = {{Cham}}, - doi = {10.1007/978-3-319-93034-3_22}, + doi = {10.1007/978-3-319-93034-3\_22}, urldate = {2021-10-26} } -@misc{abnarQuantifyingAttentionFlow2020, +@inproceedings{abnarQuantifyingAttentionFlow2020, title = {Quantifying Attention Flow in Transformers}, + booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, author = {Abnar, Samira and Zuidema, Willem}, year = {2020}, - number = {arXiv:2005.00928}, - eprint = {2005.00928}, - publisher = {{arXiv}}, - archiveprefix = {arxiv} + publisher = {{Association for Computational Linguistics}}, + address = {{Online}}, + doi = {10.18653/v1/2020.acl-main.385} } @misc{adaloglouHowPositionalEmbeddings2021, @@ -36,8 +35,8 @@ @misc{adaloglouHowPositionalEmbeddings2021 author = {Adaloglou, Nikolas}, year = {2021}, journal = {AI Summer}, - urldate = {2021-12-16}, - howpublished = {https://theaisummer.com/positional-embeddings/} + url = {https://theaisummer.com/positional-embeddings/}, + urldate = {2021-12-16} } @misc{agarapImplementingAutoencoderPyTorch2020, @@ -45,6 +44,7 @@ @misc{agarapImplementingAutoencoderPyTorch2020 author = {Agarap, Abien Fred}, year = {2020}, journal = {PyTorch}, + url = {https://medium.com/pytorch/implementing-an-autoencoder-in-pytorch-19baa22647d1}, urldate = {2021-11-03} } @@ -60,7 +60,7 @@ @book{aggarwalRecommenderSystems2016 @article{Aït-Sahalia_2009, title = {Estimating the Degree of Activity of Jumps in High Frequency Data}, - author = {{Aït-Sahalia}, Yacine and Jacod, Jean}, + author = {{A{\"i}t-Sahalia}, Yacine and Jacod, Jean}, year = {2009}, journal = {Annals of Statistics}, doi = {10.1214/08-aos640}, @@ -123,10 +123,7 @@ @misc{aminiSelfTrainingSurvey2023 title = {Self-Training: A Survey}, author = {Amini, Massih-Reza and Feofanov, Vasilii and Pauletto, Loic and Devijver, Emilie and Maximov, Yury}, year = {2023}, - number = {arXiv:2202.12040}, eprint = {2202.12040}, - publisher = {{arXiv}}, - urldate = {2023-03-26}, archiveprefix = {arxiv} } @@ -156,22 +153,19 @@ @article{antoniouLognormalDistributionStock2004 } @misc{arikTabnetAttentiveInterpretable2020, - title = {Tabnet: Attentive Interpretable Tabular Learning}, + title = {{{TabNet}}: Attentive Interpretable Tabular Learning}, author = {Arik, Sercan O. 
and Pfister, Tomas}, year = {2020}, - number = {arXiv:1908.07442}, eprint = {1908.07442}, - publisher = {{arXiv}}, - urldate = {2022-10-03}, archiveprefix = {arxiv} } -@article{arpitWhyRegularizedAutoEncoders2016, +@misc{arpitWhyRegularizedAutoEncoders2016, title = {Why Regularized Auto-Encoders Learn Sparse Representation?}, author = {Arpit, Devansh and Zhou, Yingbo and Ngo, Hung and Govindaraju, Venu}, year = {2016}, - journal = {arXiv:1505.05561 [cs, stat]}, eprint = {1505.05561}, + url = {http://arxiv.org/abs/1505.05561}, urldate = {2021-11-15}, archiveprefix = {arxiv} } @@ -214,34 +208,27 @@ @article{badaroTransformersTabularData author = {Badaro, Gilbert and Saeed, Mohammed and Papotti, Paolo} } -@misc{bahdanauNeuralMachineTranslation2016, +@inproceedings{bahdanauNeuralMachineTranslation2016, title = {Neural Machine Translation by Jointly Learning to Align and Translate}, + booktitle = {3rd {{International Conference}} on {{Learning Representations}}}, author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua}, - year = {2016}, - number = {arXiv:1409.0473}, - eprint = {1409.0473}, - publisher = {{arXiv}}, - urldate = {2023-01-30}, - archiveprefix = {arxiv} + year = {2015}, + address = {{San Diego, CA, USA}} } -@misc{bahriSCARFSelfsupervisedContrastive2022, +@inproceedings{bahriSCARFSelfsupervisedContrastive2022, title = {{{SCARF}}: Self-Supervised Contrastive Learning Using Random Feature Corruption}, + booktitle = {Tenth {{International Conference}} on {{Learning Representations}}}, author = {Bahri, Dara and Jiang, Heinrich and Tay, Yi and Metzler, Donald}, year = {2022}, - number = {arXiv:2106.15147}, - eprint = {2106.15147}, - publisher = {{arXiv}}, - archiveprefix = {arxiv} + address = {{Online}} } -@article{baLayerNormalization2016, +@misc{baLayerNormalization2016, title = {Layer Normalization}, author = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.}, year = {2016}, - journal = {arXiv:1607.06450}, eprint = {1607.06450}, - urldate = {2021-12-28}, archiveprefix = {arxiv} } @@ -276,7 +263,7 @@ @article{Barndorff-Nielsen_2005 @article{barredoarrietaExplainableArtificialIntelligence2020, title = {Explainable Artificial Intelligence ({{XAI}}): Concepts, Taxonomies, Opportunities and Challenges toward Responsible {{AI}}}, - author = {Barredo Arrieta, Alejandro and {Díaz-Rodríguez}, Natalia and Del Ser, Javier and Bennetot, Adrien and Tabik, Siham and Barbado, Alberto and Garcia, Salvador and {Gil-Lopez}, Sergio and Molina, Daniel and Benjamins, Richard and Chatila, Raja and Herrera, Francisco}, + author = {Barredo Arrieta, Alejandro and {D{\'i}az-Rodr{\'i}guez}, Natalia and Del Ser, Javier and Bennetot, Adrien and Tabik, Siham and Barbado, Alberto and Garcia, Salvador and {Gil-Lopez}, Sergio and Molina, Daniel and Benjamins, Richard and Chatila, Raja and Herrera, Francisco}, year = {2020}, journal = {Information Fusion}, volume = {58}, @@ -308,10 +295,7 @@ @misc{batesCrossvalidationWhatDoes2022 title = {Cross-Validation: What Does It Estimate and How Well Does It Do It?}, author = {Bates, Stephen and Hastie, Trevor and Tibshirani, Robert}, year = {2022}, - number = {arXiv:2104.00673}, eprint = {2104.00673}, - publisher = {{arXiv}}, - urldate = {2022-10-05}, archiveprefix = {arxiv} } @@ -328,21 +312,23 @@ @article{Battalio_2006 @article{bengioNeuralProbabilisticLanguage, title = {A Neural Probabilistic Language Model}, - author = {Bengio, Yoshua and Ducharme, Réjean and Vincent, Pascal and Jauvin, Christian}, - year = {2003} + author = {Bengio, Yoshua and 
Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian}, + year = {2003}, + journal = {Journal of Machine Learning Research}, + volume = {3}, + number = {6} } @incollection{bengioPracticalRecommendationsGradientBased2012, title = {Practical Recommendations for Gradient-Based Training of Deep Architectures}, booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}: {{Second Edition}}}, author = {Bengio, Yoshua}, - editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert}, + editor = {Montavon, Gr{\'e}goire and Orr, Genevi{\`e}ve B. and M{\"u}ller, Klaus-Robert}, year = {2012}, series = {Lecture {{Notes}} in {{Computer Science}}}, publisher = {{Springer}}, address = {{Berlin, Heidelberg}}, - doi = {10.1007/978-3-642-35289-8_26}, - urldate = {2022-10-25} + doi = {10.1007/978-3-642-35289-8\_26} } @inproceedings{bennettExploitingUnlabeledData2002, @@ -351,7 +337,7 @@ @inproceedings{bennettExploitingUnlabeledData2002 author = {Bennett, Kristin P. and Demiriz, Ayhan and Maclin, Richard}, year = {2002}, publisher = {{ACM}}, - address = {{Edmonton Alberta Canada}}, + address = {{Edmonton, Canada}}, doi = {10.1145/775047.775090}, urldate = {2023-04-23} } @@ -442,9 +428,8 @@ @misc{boardofgovernorsofthefederalreservesystemus1YearTreasuryBill1959 author = {{Board of Governors of the Federal Reserve System (US)}}, year = {1959}, journal = {FRED, Federal Reserve Bank of St. Louis}, - publisher = {{FRED, Federal Reserve Bank of St. Louis}}, - urldate = {2021-10-31}, - howpublished = {https://fred.stlouisfed.org/series/DTB1YR} + url = {https://fred.stlouisfed.org/series/DTB1YR}, + urldate = {2021-10-31} } @article{Boehmer_2007, @@ -458,19 +443,17 @@ @article{Boehmer_2007 pmid = {null} } -@misc{bojanowskiEnrichingWordVectors2017, - title = {Enriching Word Vectors with Subword Information}, +@article{bojanowskiEnrichingWordVectors2017, + title = {Enriching {{Word Vectors}} with {{Subword Information}}}, author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas}, year = {2017}, - number = {arXiv:1607.04606}, - eprint = {1607.04606}, - publisher = {{arXiv}}, - urldate = {2023-01-28}, - archiveprefix = {arxiv} + journal = {Transactions of the Association for Computational Linguistics}, + volume = {5}, + doi = {10.1162/tacl\_a\_00051} } @article{bojerLearningsKaggleForecasting, - title = {Learnings from Kaggle’s Forecasting Competitions}, + title = {Learnings from Kaggle's Forecasting Competitions}, author = {Bojer, Casper Solheim and Meldgaard, Jens Peder} } @@ -478,22 +461,18 @@ @misc{boleyBetterShortGreedy2021 title = {Better Short than Greedy: Interpretable Models through Optimal Rule Boosting}, author = {Boley, Mario and Teshuva, Simon and Bodic, Pierre Le and Webb, Geoffrey I.}, year = {2021}, - number = {arXiv:2101.08380}, eprint = {2101.08380}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2101.08380}, urldate = {2023-02-27}, archiveprefix = {arxiv} } -@misc{borisovDeepNeuralNetworks2022, - title = {Deep Neural Networks and Tabular Data: A Survey}, - author = {Borisov, Vadim and Leemann, Tobias and Seßler, Kathrin and Haug, Johannes and Pawelczyk, Martin and Kasneci, Gjergji}, +@article{borisovDeepNeuralNetworks2022, + title = {Deep {{Neural Networks}} and {{Tabular Data}}: {{A Survey}}}, + author = {Borisov, Vadim and Leemann, Tobias and Sessler, Kathrin and Haug, Johannes and Pawelczyk, Martin and Kasneci, Gjergji}, year = {2022}, - number = {arXiv:2110.01889}, - eprint = {2110.01889}, - publisher = {{arXiv}}, - urldate = {2022-10-04}, 
- archiveprefix = {arxiv} + journal = {IEEE Transactions on Neural Networks and Learning Systems}, + doi = {10.1109/TNNLS.2022.3229161} } @article{boweNewClassicalBayesian, @@ -537,7 +516,7 @@ @book{breimanClassificationRegressionTrees2017 title = {Classification and Regression Trees}, author = {Breiman, Leo and Friedman, Jerome H. and Olshen, Richard A. and Stone, Charles J.}, year = {1984}, - edition = {First}, + edition = {1}, publisher = {{CLC Press}}, address = {{Boca Raton, FL}} } @@ -556,9 +535,7 @@ @misc{breuelEffectsHyperparametersSGD2015 title = {The Effects of Hyperparameters on {{SGD}} Training of Neural Networks}, author = {Breuel, Thomas M.}, year = {2015}, - number = {arXiv:1508.02788}, eprint = {1508.02788}, - publisher = {{arXiv}}, urldate = {2022-10-25}, archiveprefix = {arxiv} } @@ -576,15 +553,14 @@ @misc{brownLanguageModelsAre2020 title = {Language Models Are Few-Shot Learners}, author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and {Herbert-Voss}, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel M. and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, year = {2020}, - number = {arXiv:2005.14165}, eprint = {2005.14165}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2005.14165}, urldate = {2023-01-09}, archiveprefix = {arxiv} } @article{burkeHybridRecommenderSystems2002, - title = {Hybrid Recommender Systems: Survey and Experiments†}, + title = {Hybrid Recommender Systems: Survey and Experiments\textdagger}, author = {Burke, Robin}, year = {2002}, journal = {User Modeling and User-Adapted Interaction}, @@ -611,19 +587,15 @@ @article{caoInformationalContentOption2005 journal = {The Journal of Business}, volume = {78}, number = {3}, - eprint = {10.1086/429654}, - eprinttype = {jstor}, - doi = {10.1086/429654}, - urldate = {2023-06-23} + doi = {10.1086/429654} } @misc{carionEndtoendObjectDetection2020, title = {End-to-End Object Detection with Transformers}, author = {Carion, Nicolas and Massa, Francisco and Synnaeve, Gabriel and Usunier, Nicolas and Kirillov, Alexander and Zagoruyko, Sergey}, year = {2020}, - number = {arXiv:2005.12872}, eprint = {2005.12872}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2005.12872}, urldate = {2023-01-18}, archiveprefix = {arxiv} } @@ -656,7 +628,7 @@ @article{carvalhoMachineLearningInterpretability2019 @article{cerdaEncodingHighcardinalityString2022, title = {Encoding High-Cardinality String Categorical Variables}, - author = {Cerda, Patricio and Varoquaux, Gaël}, + author = {Cerda, Patricio and Varoquaux, Ga{\"e}l}, year = {2022}, journal = {IEEE Transactions on Knowledge and Data Engineering}, volume = {34}, @@ -737,9 +709,8 @@ @misc{chanTransformersGeneralizeDifferently2022 title = {Transformers Generalize Differently from Information Stored in Context vs in Weights}, author = {Chan, Stephanie C. Y. and Dasgupta, Ishita and Kim, Junkyung and Kumaran, Dharshan and Lampinen, Andrew K. 
and Hill, Felix}, year = {2022}, - number = {arXiv:2210.05675}, eprint = {2210.05675}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2210.05675}, urldate = {2022-10-18}, archiveprefix = {arxiv} } @@ -753,11 +724,11 @@ @inproceedings{chapelleSemiSupervisedClassificationLow2005 @book{chapelleSemisupervisedLearning2006, title = {Semi-Supervised Learning}, - editor = {Chapelle, Olivier and Schölkopf, Bernhard and Zien, Alexander}, + author = {Chapelle, Olivier and Sch{\"o}lkopf, Bernhard and Zien, Alexander}, year = {2006}, series = {Adaptive Computation and Machine Learning}, publisher = {{MIT Press}}, - address = {{Cambridge, Mass}} + address = {{Cambridge, MA}} } @article{chaumUntraceableElectronicMail1981, @@ -771,15 +742,14 @@ @article{chaumUntraceableElectronicMail1981 urldate = {2022-10-14} } -@misc{cheferGenericAttentionmodelExplainability2021, - title = {Generic Attention-Model Explainability for Interpreting Bi-Modal and Encoder-Decoder Transformers}, +@inproceedings{cheferGenericAttentionmodelExplainability2021, + title = {Generic {{Attention-model Explainability}} for {{Interpreting Bi-Modal}} and {{Encoder-Decoder Transformers}}}, + booktitle = {2021 {{IEEE}}/{{CVF International Conference}} on {{Computer Vision}}}, author = {Chefer, Hila and Gur, Shir and Wolf, Lior}, year = {2021}, - number = {arXiv:2103.15679}, - eprint = {2103.15679}, - publisher = {{arXiv}}, - urldate = {2023-01-08}, - archiveprefix = {arxiv} + publisher = {{IEEE}}, + address = {{Montreal, QC, Canada}}, + doi = {10.1109/ICCV48922.2021.00045} } @inproceedings{cheferTransformerInterpretabilityAttention2021, @@ -797,29 +767,26 @@ @misc{chenAlgorithmsEstimateShapley2022 title = {Algorithms to Estimate {{Shapley}} Value Feature Attributions}, author = {Chen, Hugh and Covert, Ian C. and Lundberg, Scott M. 
and Lee, Su-In}, year = {2022}, - number = {arXiv:2207.07605}, eprint = {2207.07605}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2207.07605}, urldate = {2023-05-23}, archiveprefix = {arxiv} } -@misc{chenDebiasedSelfTrainingSemiSupervised2022, +@inproceedings{chenDebiasedSelfTrainingSemiSupervised2022, title = {Debiased Self-Training for Semi-Supervised Learning}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, author = {Chen, Baixu and Jiang, Junguang and Wang, Ximei and Wan, Pengfei and Wang, Jianmin and Long, Mingsheng}, year = {2022}, - number = {arXiv:2202.07136}, - eprint = {2202.07136}, - publisher = {{arXiv}}, - urldate = {2023-04-05}, - archiveprefix = {arxiv} + volume = {36}, + publisher = {{Curran Associates, Inc.}}, + address = {{Long Beach, CA}} } @misc{chenDeepLearningAsset2021, title = {Deep Learning in Asset Pricing}, author = {Chen, Luyang and Pelger, Markus and Zhu, Jason}, year = {2021}, - number = {1904.00745}, eprint = {1904.00745}, archiveprefix = {arxiv} } @@ -839,9 +806,8 @@ @misc{chenExcelFormerNeuralNetwork2023 title = {{{ExcelFormer}}: A Neural Network Surpassing Gbdts on Tabular Data}, author = {Chen, Jintai and Yan, Jiahuan and Chen, Danny Ziyi and Wu, Jian}, year = {2023}, - number = {arXiv:2301.02819}, eprint = {2301.02819}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2301.02819}, urldate = {2023-01-14}, archiveprefix = {arxiv} } @@ -862,17 +828,15 @@ @inproceedings{chenSimpleFrameworkContrastive2020 booktitle = {Proceedings of the 37th {{International Conference}} on {{Machine Learning}}}, author = {Chen, Ting and Kornblith, Simon and Norouzi, Mohammad and Hinton, Geoffrey}, year = {2020}, - publisher = {{PMLR}}, - urldate = {2023-04-24} + publisher = {{PMLR}} } @misc{chenTrainingDeepNets2016, title = {Training Deep Nets with Sublinear Memory Cost}, author = {Chen, Tianqi and Xu, Bing and Zhang, Chiyuan and Guestrin, Carlos}, year = {2016}, - number = {arXiv:1604.06174}, eprint = {1604.06174}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1604.06174}, urldate = {2022-11-23}, archiveprefix = {arxiv} } @@ -881,9 +845,8 @@ @misc{chenTrueModelTrue2020 title = {True to the Model or True to the Data?}, author = {Chen, Hugh and Janizek, Joseph D. and Lundberg, Scott and Lee, Su-In}, year = {2020}, - number = {arXiv:2006.16234}, eprint = {2006.16234}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2006.16234}, urldate = {2023-04-09}, archiveprefix = {arxiv} } @@ -893,9 +856,7 @@ @article{chenXGBoostScalableTree2016 author = {Chen, Tianqi and Guestrin, Carlos}, year = {2016}, journal = {Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, - eprint = {1603.02754}, - doi = {10.1145/2939672.2939785}, - archiveprefix = {arxiv} + doi = {10.1145/2939672.2939785} } @article{choiEstimationBidAskSpreads1988, @@ -913,9 +874,8 @@ @misc{cholakovGatedTabTransformerEnhancedDeep2022 title = {The {{GatedTabTransformer}}. 
{{An}} Enhanced Deep Learning Architecture for Tabular Modeling}, author = {Cholakov, Radostin and Kolev, Todor}, year = {2022}, - number = {arXiv:2201.00199}, eprint = {2201.00199}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2201.00199}, urldate = {2023-01-11}, archiveprefix = {arxiv} } @@ -931,12 +891,12 @@ @article{chordiaIndexOptionTrading2021 urldate = {2023-06-20} } -@article{christianoDeepReinforcementLearning2017, +@misc{christianoDeepReinforcementLearning2017, title = {Deep Reinforcement Learning from Human Preferences}, author = {Christiano, Paul and Leike, Jan and Brown, Tom B. and Martic, Miljan and Legg, Shane and Amodei, Dario}, year = {2017}, - journal = {arXiv:1706.03741 [cs, stat]}, eprint = {1706.03741}, + url = {http://arxiv.org/abs/1706.03741}, urldate = {2021-09-23}, archiveprefix = {arxiv} } @@ -952,35 +912,29 @@ @article{chuanSuccessAdaBoostIts2021 urldate = {2022-07-12} } -@misc{clarkElectraPretrainingText2020, - title = {Electra: Pre-Training Text Encoders as Discriminators Rather than Generators}, +@inproceedings{clarkElectraPretrainingText2020, + title = {{{ELECTRA}}: Pre-Training Text Encoders as Discriminators Rather than Generators}, + booktitle = {International {{Conference}} on {{Learning Representations}}}, author = {Clark, Kevin and Luong, Minh-Thang and Le, Quoc V. and Manning, Christopher D.}, year = {2020}, - number = {arXiv:2003.10555}, - eprint = {2003.10555}, - publisher = {{arXiv}}, - archiveprefix = {arxiv} + address = {{Online}} } -@misc{clarkWhatDoesBERT2019, - title = {What {{Does BERT Look At}}? {{An Analysis}} of {{BERT}}'s {{Attention}}}, +@inproceedings{clarkWhatDoesBERT2019, + title = {What Does {{BERT}} Look at? {{An}} Analysis of {{BERT}}'s Attention}, + booktitle = {Proceedings of the 2019 {{ACL}} Workshop {{BlackboxNLP}}: {{Analyzing}} and Interpreting Neural Networks for {{NLP}}}, author = {Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D.}, year = {2019}, - number = {arXiv:1906.04341}, - eprint = {1906.04341}, - publisher = {{arXiv}}, - urldate = {2023-06-12}, - archiveprefix = {arxiv} + publisher = {{Association for Computational Linguistics}}, + address = {{Florence, Italy}}, + doi = {10.18653/v1/W19-4828} } @misc{coenenVisualizingMeasuringGeometry2019, title = {Visualizing and {{Measuring}} the {{Geometry}} of {{BERT}}}, - author = {Coenen, Andy and Reif, Emily and Yuan, Ann and Kim, Been and Pearce, Adam and Viégas, Fernanda and Wattenberg, Martin}, + author = {Coenen, Andy and Reif, Emily and Yuan, Ann and Kim, Been and Pearce, Adam and Vi{\'e}gas, Fernanda and Wattenberg, Martin}, year = {2019}, - number = {arXiv:1906.02715}, eprint = {1906.02715}, - publisher = {{arXiv}}, - urldate = {2023-06-17}, archiveprefix = {arxiv} } @@ -995,7 +949,7 @@ @article{collin-dufresneInformedTradingStock2021 urldate = {2023-06-26} } -@article{congDEEPSEQUENCEMODELING, +@misc{congDEEPSEQUENCEMODELING, title = {Deep Sequence Modeling: Development and Applications in Asset Pricing}, author = {Cong, Lin William and Tang, Ke and Wang, Jingyuan and Zhang, Yang} } @@ -1012,20 +966,17 @@ @article{Cont_2013 } @article{covertExplainingRemovingUnified, - title = {Explaining by {{Removing}}: {{A Unified Framework}} for {{Model Explanation}}}, + title = {Explaining by {{Removing}}: {{A Unified Framework}} for {{Model Explanation}}}, author = {Covert, Ian C} } -@misc{covertUnderstandingGlobalFeature2020, +@inproceedings{covertUnderstandingGlobalFeature2020, title = {Understanding Global Feature Contributions with 
Additive Importance Measures}, - author = {Covert, Ian and Lundberg, Scott and Lee, Su-In}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, + author = {Covert, Ian and Lundberg, Scott M and Lee, Su-In}, year = {2020}, - number = {arXiv:2004.00668}, - eprint = {2004.00668}, - publisher = {{arXiv}}, - doi = {10.48550/arXiv.2004.00668}, - urldate = {2023-05-01}, - archiveprefix = {arxiv} + volume = {33}, + publisher = {{Curran Associates, Inc.}} } @article{cowgillAlgorithmicFairnessEconomics, @@ -1040,7 +991,7 @@ @inbook{coxExploratoryDataAnalysis2017 year = {2017}, publisher = {{Apress}}, address = {{Berkeley, CA}}, - doi = {10.1007/978-1-4842-2256-0_3}, + doi = {10.1007/978-1-4842-2256-0\_3}, urldate = {2023-01-22}, collaborator = {Cox, Victoria} } @@ -1062,7 +1013,7 @@ @article{coxRelationForwardPrices1981 @article{creamerAutomatedTradingBoosting2010, title = {Automated Trading with Boosting and Expert Weighting}, - author = {Creamer, Germán and Freund, Yoav}, + author = {Creamer, Germ{\'a}n and Freund, Yoav}, year = {2010}, journal = {Quantitative Finance}, volume = {10}, @@ -1073,7 +1024,8 @@ @article{creamerAutomatedTradingBoosting2010 @article{crspDATADESCRIPTIONSGUIDE, title = {Data Descriptions Guide Crsp Us Stock \& Us Index Databases}, - author = {{CRSP}} + author = {{CRSP}}, + url = {http://www.crsp.org/files/data_descriptions_guide_0.pdf} } @misc{culurcielloFallRNNLSTM2019, @@ -1081,8 +1033,8 @@ @misc{culurcielloFallRNNLSTM2019 author = {Culurciello, Eugenio}, year = {2019}, journal = {Medium}, - urldate = {2021-12-03}, - howpublished = {https://towardsdatascience.com/the-fall-of-rnn-lstm-2d1594c74ce0} + url = {https://towardsdatascience.com/the-fall-of-rnn-lstm-2d1594c74ce0}, + urldate = {2021-12-03} } @misc{culurcielloMemoryAttentionSequences2018, @@ -1090,8 +1042,8 @@ @misc{culurcielloMemoryAttentionSequences2018 author = {Culurciello, Eugenio}, year = {2018}, journal = {Medium}, - urldate = {2021-12-03}, - howpublished = {https://towardsdatascience.com/memory-attention-sequences-37456d271992} + url = {https://towardsdatascience.com/memory-attention-sequences-37456d271992}, + urldate = {2021-12-03} } @article{daiEmbeddingLearning2022, @@ -1109,9 +1061,8 @@ @misc{daiTransformerXLAttentiveLanguage2019 title = {Transformer-{{XL}}: Attentive Language Models beyond a Fixed-Length Context}, author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. 
and Salakhutdinov, Ruslan}, year = {2019}, - number = {arXiv:1901.02860}, eprint = {1901.02860}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1901.02860}, urldate = {2023-01-11}, archiveprefix = {arxiv} } @@ -1119,19 +1070,19 @@ @misc{daiTransformerXLAttentiveLanguage2019 @inproceedings{dalche-bucSemisupervisedMarginBoost2001, title = {Semi-Supervised {{MarginBoost}}}, booktitle = {Advances in {{Neural Information Processing Systems}}}, - author = {{d' Alché-Buc}, Florence and Grandvalet, Yves and Ambroise, Christophe}, + author = {{d' Alch{\'e}-Buc}, Florence and Grandvalet, Yves and Ambroise, Christophe}, year = {2001}, volume = {14}, - publisher = {{MIT Press}} + publisher = {{MIT Press}}, + address = {{Vancouver}} } @misc{darabiContrastiveMixupSelf2021, title = {Contrastive Mixup: Self- and Semi-Supervised Learning for Tabular Domain}, author = {Darabi, Sajad and Fazeli, Shayan and Pazoki, Ali and Sankararaman, Sriram and Sarrafzadeh, Majid}, year = {2021}, - number = {arXiv:2108.12296}, eprint = {2108.12296}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2108.12296}, urldate = {2023-01-24}, archiveprefix = {arxiv} } @@ -1140,9 +1091,8 @@ @misc{dauphinLanguageModelingGated2017 title = {Language Modeling with Gated Convolutional Networks}, author = {Dauphin, Yann N. and Fan, Angela and Auli, Michael and Grangier, David}, year = {2017}, - number = {arXiv:1612.08083}, eprint = {1612.08083}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1612.08083}, urldate = {2023-01-17}, archiveprefix = {arxiv} } @@ -1159,12 +1109,9 @@ @article{davisGradientBoostingQuantitative2019 @misc{dehghaniUniversalTransformers2019, title = {Universal Transformers}, - author = {Dehghani, Mostafa and Gouws, Stephan and Vinyals, Oriol and Uszkoreit, Jakob and Kaiser, Łukasz}, + author = {Dehghani, Mostafa and Gouws, Stephan and Vinyals, Oriol and Uszkoreit, Jakob and Kaiser, {\L}ukasz}, year = {2019}, - number = {arXiv:1807.03819}, eprint = {1807.03819}, - publisher = {{arXiv}}, - urldate = {2023-01-16}, archiveprefix = {arxiv} } @@ -1180,7 +1127,7 @@ @article{dengStrategicTradingManipulation } @inproceedings{devlinBERTPretrainingDeep2019, - title = {Bert: Pre-Training of Deep Bidirectional Transformers for Language Understanding}, + title = {{{BERT}}: Pre-Training of Deep Bidirectional Transformers for Language Understanding}, booktitle = {Proceedings of the 2019 {{Conference}} of the {{North American Chapter}} of the {{Association}} for {{Computational Linguistics}}: {{Human Language Technologies}}, {{Volume}} 1 ({{Long}} and {{Short Papers}})}, author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, year = {2019}, @@ -1220,10 +1167,7 @@ @misc{dosovitskiyImageWorth16x162021 title = {An Image Is Worth 16x16 Words: Transformers for Image Recognition at Scale}, author = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, year = {2021}, - number = {arXiv:2010.11929}, eprint = {2010.11929}, - publisher = {{arXiv}}, - urldate = {2023-01-30}, archiveprefix = {arxiv} } @@ -1272,7 +1216,7 @@ @article{easleyDiscerningInformationTrade2016 @article{easleyFlowToxicityLiquidity2012, title = {Flow Toxicity and Liquidity in a High-Frequency World}, - author = {Easley, David and {López de Prado}, Marcos M. 
and O'Hara, Maureen}, + author = {Easley, David and {L{\'o}pez de Prado}, Marcos M. and O'Hara, Maureen}, year = {2012}, journal = {Review of Financial Studies}, volume = {25}, @@ -1282,8 +1226,8 @@ @article{easleyFlowToxicityLiquidity2012 } @article{easleyMicrostructureFlashCrash2011, - title = {The Microstructure of the “Flash Crash”: {\emph{Flow Toxicity, Liquidity Crashes, and the Probability of Informed Trading}}}, - author = {Easley, David and {López de Prado}, Marcos M. and O’Hara, Maureen}, + title = {The Microstructure of the ``Flash Crash'': {\emph{Flow Toxicity, Liquidity Crashes, and the Probability of Informed Trading}}}, + author = {Easley, David and {L{\'o}pez de Prado}, Marcos M. and O'Hara, Maureen}, year = {2011}, journal = {The Journal of Portfolio Management}, volume = {37}, @@ -1307,6 +1251,7 @@ @article{elhage2021mathematical author = {Elhage, Nelson and Nanda, Neel and Olsson, Catherine and Henighan, Tom and Joseph, Nicholas and Mann, Ben and Askell, Amanda and Bai, Yuntao and Chen, Anna and Conerly, Tom and DasSarma, Nova and Drain, Dawn and Ganguli, Deep and {Hatfield-Dodds}, Zac and Hernandez, Danny and Jones, Andy and Kernion, Jackson and Lovitt, Liane and Ndousse, Kamal and Amodei, Dario and Brown, Tom and Clark, Jack and Kaplan, Jared and McCandlish, Sam and Olah, Chris}, year = {2021}, journal = {Transformer Circuits Thread}, + url = {https://transformer-circuits.pub/2021/framework/index.html}, urldate = {2023-01-11} } @@ -1332,7 +1277,11 @@ @article{enguehardSemiSupervisedLearningDeep2019 @article{erhanWhyDoesUnsupervised, title = {Why Does Unsupervised Pre-Training Help Deep Learning?}, - author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy} + author = {Erhan, Dumitru and Bengio, Yoshua and Courville, Aaron and Manzagol, Pierre-Antoine and Vincent, Pascal and Bengio, Samy}, + year = {2010}, + journal = {Journal of Machine Learning Research}, + volume = {11}, + number = {19} } @misc{ExplainingIndividualPredictions, @@ -1414,13 +1363,13 @@ @misc{federalreservebankofstlouisNBERBasedRecession2022 author = {{Federal Reserve Bank of St. Louis}}, year = {2022}, journal = {FRED, Federal Reserve Bank of St. 
Louis}, - urldate = {2022-07-26}, - howpublished = {https://fred.stlouisfed.org/series/USREC} + url = {https://fred.stlouisfed.org/series/USREC}, + urldate = {2022-07-26} } @article{feldhutterSameBondDifferent2012, title = {The Same Bond at Different Prices: Identifying Search Frictions and Selling Pressures}, - author = {Feldhütter, Peter}, + author = {Feldh{\"u}tter, Peter}, year = {2012}, journal = {The Review of Financial Studies}, volume = {25}, @@ -1429,13 +1378,11 @@ @article{feldhutterSameBondDifferent2012 urldate = {2022-12-30} } -@article{fengDeepLearningPredicting2018, +@misc{fengDeepLearningPredicting2018, title = {Deep Learning for Predicting Asset Returns}, author = {Feng, Guanhao and He, Jingyu and Polson, Nicholas G.}, year = {2018}, - journal = {arXiv:1804.09314}, eprint = {1804.09314}, - urldate = {2021-10-26}, archiveprefix = {arxiv} } @@ -1451,9 +1398,7 @@ @misc{fiedlerSimpleModificationsImprove2021 title = {Simple Modifications to Improve Tabular Neural Networks}, author = {Fiedler, James}, year = {2021}, - number = {arXiv:2108.03214}, eprint = {2108.03214}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -1468,7 +1413,7 @@ @article{finucaneDirectTestMethods2000 } @article{fisherAllModelsAre, - title = {All Models Are Wrong, but Many Are Useful: Learning a Variable’s Importance by Studying an Entire Class of Prediction Models Simultaneously}, + title = {All Models Are Wrong, but Many Are Useful: Learning a Variable's Importance by Studying an Entire Class of Prediction Models Simultaneously}, author = {Fisher, Aaron and Rudin, Cynthia and Dominici, Francesca} } @@ -1549,7 +1494,7 @@ @article{friedmanStochasticGradientBoosting2002 @article{frommelAccuracyTradeClassification2021, title = {The Accuracy of Trade Classification Systems on the Foreign Exchange Market: Evidence from the {{RUB}}/{{USD}} Market}, - author = {Frömmel, Michael and D'Hoore, Dick and Lampaert, Kevin}, + author = {Fr{\"o}mmel, Michael and D'Hoore, Dick and Lampaert, Kevin}, year = {2021}, journal = {Finance Research Letters}, volume = {42}, @@ -1562,13 +1507,13 @@ @misc{gabrielDynamicPricingUsing2021 author = {Gabriel, Reslley}, year = {2021}, journal = {Medium}, - urldate = {2022-01-14}, - howpublished = {https://towardsdatascience.com/dynamic-pricing-using-reinforcement-learning-and-neural-networks-cc3abe374bf5} + url = {https://towardsdatascience.com/dynamic-pricing-using-reinforcement-learning-and-neural-networks-cc3abe374bf5}, + urldate = {2022-01-14} } @article{garleanuDemandBasedOptionPricing2009, title = {Demand-Based Option Pricing}, - author = {Gârleanu, Nicolae and Pedersen, Lasse Heje and Poteshman, Allen M.}, + author = {G{\^a}rleanu, Nicolae and Pedersen, Lasse Heje and Poteshman, Allen M.}, year = {2009}, journal = {Review of Financial Studies}, volume = {22}, @@ -1580,9 +1525,8 @@ @misc{gevaTransformerFeedforwardLayers2021 title = {Transformer Feed-Forward Layers Are Key-Value Memories}, author = {Geva, Mor and Schuster, Roei and Berant, Jonathan and Levy, Omer}, year = {2021}, - number = {arXiv:2012.14913}, eprint = {2012.14913}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2012.14913}, urldate = {2023-01-16}, archiveprefix = {arxiv} } @@ -1602,10 +1546,10 @@ @inproceedings{glorotDeepSparseRectifier2011 booktitle = {Proceedings of the {{Fourteenth International Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, author = {Glorot, Xavier and Bordes, Antoine and Bengio, Yoshua}, year = {2011}, + series = {Proceedings of {{Machine Learning Research}}}, 
volume = {15}, - publisher = {{JMLR Workshop and Conference Proceedings}}, - address = {{Fort Lauderdale, FL}}, - urldate = {2021-12-08} + publisher = {{PMLR}}, + address = {{Fort Lauderdale, FL}} } @article{Glosten_1988, @@ -1657,14 +1601,14 @@ @book{goossensLaTeXGraphicsCompanion2008 address = {{Upper Saddle River, NJ}} } -@misc{gorishniyEmbeddingsNumericalFeatures2022, +@inproceedings{gorishniyEmbeddingsNumericalFeatures2022, title = {On Embeddings for Numerical Features in Tabular Deep Learning}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, author = {Gorishniy, Yury and Rubachev, Ivan and Babenko, Artem}, + editor = {Koyejo, S. and Mohamed, S. and Agarwal, A. and Belgrave, D. and Cho, K. and Oh, A.}, year = {2022}, - number = {2203.05556}, - eprint = {2203.05556}, - publisher = {{arXiv}}, - archiveprefix = {arxiv} + volume = {35}, + publisher = {{Curran Associates, Inc.}} } @inproceedings{gorishniyRevisitingDeepLearning2021, @@ -1679,13 +1623,13 @@ @inproceedings{gorishniyRevisitingDeepLearning2021 @misc{GradientBoostPart, title = {Gradient Boost Part 1 (of 4): Regression Main Ideas - {{YouTube}}}, - urldate = {2021-12-25}, - howpublished = {https://www.youtube.com/watch?v=3CC4N4z3GJc} + url = {https://www.youtube.com/watch?v=3CC4N4z3GJc}, + urldate = {2021-12-25} } @article{grammigDivergingRoadsTheoryBased2020, title = {Diverging Roads: Theory-Based vs. Machine Learning-Implied Stock Risk Premia}, - author = {Grammig, Joachim and Hanenberg, Constantin and Schlag, Christian and Sönksen, Jantje}, + author = {Grammig, Joachim and Hanenberg, Constantin and Schlag, Christian and S{\"o}nksen, Jantje}, year = {2020}, journal = {SSRN Electronic Journal}, doi = {10.2139/ssrn.3536835}, @@ -1696,15 +1640,15 @@ @misc{grauerOptionTradeClassification2022 title = {Option Trade Classification}, author = {Grauer, Caroline and Schuster, Philipp and {Uhrig-Homburg}, Marliese}, year = {2023}, - publisher = {{SSRN}}, + eprint = {ssrn.4098475}, doi = {10.2139/ssrn.4098475}, archiveprefix = {SSRN} } @inproceedings{grinsztajnWhyTreebasedModels2022, title = {Why Do Tree-Based Models Still Outperform Deep Learning on Typical Tabular Data?}, - booktitle = {Proceedings of the 36th {{International Conference}} on {{Neural Information Processing Systems}}}, - author = {Grinsztajn, Léo and Oyallon, Edouard and Varoquaux, Gaël}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, + author = {Grinsztajn, L{\'e}o and Oyallon, Edouard and Varoquaux, Ga{\"e}l}, year = {2022}, series = {{{NeurIPS}} 2022}, volume = {36}, @@ -1734,8 +1678,8 @@ @article{guEmpiricalAssetPricing2020 } @article{gunnarssonDeepLearningCredit2021, - title = {Deep Learning for Credit Scoring: Do or Don’t?}, - author = {Gunnarsson, Björn Rafn and {vanden Broucke}, Seppe and Baesens, Bart and Óskarsdóttir, María and Lemahieu, Wilfried}, + title = {Deep Learning for Credit Scoring: Do or Don't?}, + author = {Gunnarsson, Bj{\"o}rn Rafn and {vanden Broucke}, Seppe and Baesens, Bart and {\'O}skarsd{\'o}ttir, Mar{\'i}a and Lemahieu, Wilfried}, year = {2021}, journal = {European Journal of Operational Research}, volume = {295}, @@ -1759,9 +1703,8 @@ @misc{guoEntityEmbeddingsCategorical2016 title = {Entity Embeddings of Categorical Variables}, author = {Guo, Cheng and Berkhahn, Felix}, year = {2016}, - number = {arXiv:1604.06737}, eprint = {1604.06737}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1604.06737}, urldate = {2023-01-25}, archiveprefix = {arxiv} } @@ -1779,7 +1722,7 @@ 
@inproceedings{gyamerahStockMarketMovement2019 @article{hagstromerBiasEffectiveBidask2021, title = {Bias in the Effective Bid-Ask Spread}, - author = {Hagströmer, Björn}, + author = {Hagstr{\"o}mer, Bj{\"o}rn}, year = {2021}, journal = {Journal of Financial Economics}, volume = {142}, @@ -1816,10 +1759,7 @@ @article{harrisDayEndTransactionPrice1989 journal = {The Journal of Financial and Quantitative Analysis}, volume = {24}, number = {1}, - eprint = {2330746}, - eprinttype = {jstor}, - doi = {10.2307/2330746}, - urldate = {2023-02-05} + doi = {10.2307/2330746} } @article{harveyMultivariateStochasticVariance1994, @@ -1879,6 +1819,7 @@ @inproceedings{hazimehTreeEnsembleLayer2020 author = {Hazimeh, Hussein and Ponomareva, Natalia and Mol, Petros and Tan, Zhenyu and Mazumder, Rahul}, year = {2020}, publisher = {{PMLR}}, + url = {https://proceedings.mlr.press/v119/hazimeh20a.html}, urldate = {2023-05-15} } @@ -1887,18 +1828,15 @@ @inproceedings{heatonEmpiricalAnalysisFeature2016 booktitle = {{{SoutheastCon}} 2016}, author = {Heaton, Jeff}, year = {2016}, - eprint = {1701.07852}, - doi = {10.1109/SECON.2016.7506650}, - archiveprefix = {arxiv} + doi = {10.1109/SECON.2016.7506650} } @misc{heBagTricksImage2018, title = {Bag of Tricks for Image Classification with Convolutional Neural Networks}, author = {He, Tong and Zhang, Zhi and Zhang, Hang and Zhang, Zhongyue and Xie, Junyuan and Li, Mu}, year = {2018}, - number = {arXiv:1812.01187}, eprint = {1812.01187}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1812.01187}, urldate = {2022-10-26}, archiveprefix = {arxiv} } @@ -1907,10 +1845,7 @@ @misc{heDeepResidualLearning2015 title = {Deep Residual Learning for Image Recognition}, author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian}, year = {2015}, - number = {arXiv:1512.03385}, eprint = {1512.03385}, - publisher = {{arXiv}}, - urldate = {2023-01-11}, archiveprefix = {arxiv} } @@ -1923,21 +1858,17 @@ @misc{hegselmannTabLLMFewshotClassification2022 title = {{{TabLLM}}: Few-Shot Classification of Tabular Data with Large Language Models}, author = {Hegselmann, Stefan and Buendia, Alejandro and Lang, Hunter and Agrawal, Monica and Jiang, Xiaoyi and Sontag, David}, year = {2022}, - number = {arXiv:2210.10723}, eprint = {2210.10723}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2210.10723}, urldate = {2023-01-14}, archiveprefix = {arxiv} } @misc{hendrycksGaussianErrorLinear2020, - title = {Gaussian Error Linear Units (Gelus)}, + title = {Gaussian {{Error Linear Units}} ({{GELUs}})}, author = {Hendrycks, Dan and Gimpel, Kevin}, year = {2020}, - number = {arXiv:1606.08415}, eprint = {1606.08415}, - publisher = {{arXiv}}, - urldate = {2023-01-16}, archiveprefix = {arxiv} } @@ -1945,16 +1876,15 @@ @misc{hertzPrompttopromptImageEditing2022 title = {Prompt-to-Prompt Image Editing with Cross Attention Control}, author = {Hertz, Amir and Mokady, Ron and Tenenbaum, Jay and Aberman, Kfir and Pritch, Yael and {Cohen-Or}, Daniel}, year = {2022}, - number = {arXiv:2208.01626}, eprint = {2208.01626}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2208.01626}, urldate = {2022-09-12}, archiveprefix = {arxiv} } @article{hidasiRecurrentNeuralNetworks2018, title = {Recurrent Neural Networks with Top-k Gains for Session-Based Recommendations}, - author = {Hidasi, Balázs and Karatzoglou, Alexandros}, + author = {Hidasi, Bal{\'a}zs and Karatzoglou, Alexandros}, year = {2018}, journal = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management}, 
eprint = {1706.03847}, @@ -1986,19 +1916,19 @@ @article{hoangMachineLearningMethods author = {Hoang, Daniel and Wiegratz, Kevin} } -@article{hoAxialAttentionMultidimensional2019, +@misc{hoAxialAttentionMultidimensional2019, title = {Axial Attention in Multidimensional Transformers}, author = {Ho, Jonathan and Kalchbrenner, Nal and Weissenborn, Dirk and Salimans, Tim}, year = {2019}, - journal = {arXiv:1912.12180 [cs]}, eprint = {1912.12180}, + url = {http://arxiv.org/abs/1912.12180}, urldate = {2022-01-04}, archiveprefix = {arxiv} } @article{hochreiterLongShorttermMemory1997, title = {Long Short-Term Memory}, - author = {Hochreiter, Sepp and Schmidhuber, Jürgen}, + author = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen}, year = {1997}, journal = {Neural Computation}, volume = {9}, @@ -2011,10 +1941,7 @@ @misc{hoffmannTrainingComputeOptimalLarge2022 title = {Training {{Compute-Optimal Large Language Models}}}, author = {Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and Casas, Diego de Las and Hendricks, Lisa Anne and Welbl, Johannes and Clark, Aidan and Hennigan, Tom and Noland, Eric and Millican, Katie and Driessche, George van den and Damoc, Bogdan and Guy, Aurelia and Osindero, Simon and Simonyan, Karen and Elsen, Erich and Rae, Jack W. and Vinyals, Oriol and Sifre, Laurent}, year = {2022}, - number = {arXiv:2203.15556}, eprint = {2203.15556}, - publisher = {{arXiv}}, - urldate = {2023-05-31}, archiveprefix = {arxiv} } @@ -2041,7 +1968,7 @@ @article{holthausenEffectLargeBlock1987 @book{holzingerXxAIExplainableAI2022, title = {{{xxAI}} - {{Beyond Explainable AI}}: {{International Workshop}}, {{Held}} in {{Conjunction}} with {{ICML}} 2020, {{July}} 18, 2020, {{Vienna}}, {{Austria}}, {{Revised}} and {{Extended Papers}}}, - editor = {Holzinger, Andreas and Goebel, Randy and Fong, Ruth and Moon, Taesup and Müller, Klaus-Robert and Samek, Wojciech}, + editor = {Holzinger, Andreas and Goebel, Randy and Fong, Ruth and Moon, Taesup and M{\"u}ller, Klaus-Robert and Samek, Wojciech}, year = {2022}, series = {Lecture {{Notes}} in {{Computer Science}}}, volume = {13200}, @@ -2077,10 +2004,7 @@ @misc{huangSnapshotEnsemblesTrain2017 title = {Snapshot Ensembles: Train 1, Get {{M}} for Free}, author = {Huang, Gao and Li, Yixuan and Pleiss, Geoff and Liu, Zhuang and Hopcroft, John E. 
and Weinberger, Kilian Q.}, year = {2017}, - number = {arXiv:1704.00109}, eprint = {1704.00109}, - publisher = {{arXiv}}, - urldate = {2021-08-17}, archiveprefix = {arxiv} } @@ -2088,9 +2012,7 @@ @misc{huangTabTransformerTabularData2020 title = {{{TabTransformer}}: Tabular Data Modeling Using Contextual Embeddings}, author = {Huang, Xin and Khetan, Ashish and Cvitkovic, Milan and Karnin, Zohar}, year = {2020}, - number = {2012.06678}, eprint = {2012.06678}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -2122,7 +2044,7 @@ @book{hyndmanForecastingPrinciplesPractice2021 title = {Forecasting: Principles and Practice}, author = {Hyndman, Rob J and Athanasopoulos, George}, year = {2021}, - edition = {Third} + edition = {3} } @article{inceINDIVIDUALEQUITYRETURN2006, @@ -2146,10 +2068,7 @@ @misc{ioffeBatchNormalizationAccelerating2015 title = {Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift}, author = {Ioffe, Sergey and Szegedy, Christian}, year = {2015}, - number = {arXiv:1502.03167}, eprint = {1502.03167}, - publisher = {{arXiv}}, - urldate = {2023-01-01}, archiveprefix = {arxiv} } @@ -2168,9 +2087,8 @@ @misc{ivanovDataMovementAll2021 title = {Data Movement Is All You Need: A Case Study on Optimizing Transformers}, author = {Ivanov, Andrei and Dryden, Nikoli and {Ben-Nun}, Tal and Li, Shigang and Hoefler, Torsten}, year = {2021}, - number = {arXiv:2007.00072}, eprint = {2007.00072}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2007.00072}, urldate = {2023-01-15}, archiveprefix = {arxiv} } @@ -2179,9 +2097,8 @@ @misc{izmailovAveragingWeightsLeads2019 title = {Averaging Weights Leads to Wider Optima and Better Generalization}, author = {Izmailov, Pavel and Podoprikhin, Dmitrii and Garipov, Timur and Vetrov, Dmitry and Wilson, Andrew Gordon}, year = {2019}, - number = {arXiv:1803.05407}, eprint = {1803.05407}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1803.05407}, urldate = {2022-10-13}, archiveprefix = {arxiv} } @@ -2201,9 +2118,7 @@ @misc{jainAttentionNotExplanation2019 title = {Attention Is Not Explanation}, author = {Jain, Sarthak and Wallace, Byron C.}, year = {2019}, - number = {arXiv:1902.10186}, eprint = {1902.10186}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -2220,7 +2135,7 @@ @article{japkowiczClassImbalanceProblem2002 @inproceedings{jawaharWhatDoesBERT2019, title = {What Does {{BERT}} Learn about the Structure of Language?}, booktitle = {Proceedings of the 57th {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}}}, - author = {Jawahar, Ganesh and Sagot, Benoît and Seddah, Djamé}, + author = {Jawahar, Ganesh and Sagot, Beno{\^i}t and Seddah, Djam{\'e}}, year = {2019}, publisher = {{Association for Computational Linguistics}}, address = {{Florence, Italy}}, @@ -2240,9 +2155,8 @@ @misc{jinPruningEffectGeneralization2022 title = {Pruning's Effect on Generalization through the Lens of Training and Regularization}, author = {Jin, Tian and Carbin, Michael and Roy, Daniel M. 
and Frankle, Jonathan and Dziugaite, Gintare Karolina}, year = {2022}, - number = {arXiv:2210.13738}, eprint = {2210.13738}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2210.13738}, urldate = {2023-01-15}, archiveprefix = {arxiv} } @@ -2260,11 +2174,9 @@ @article{johnsonSurveyDeepLearning2019 @misc{josseConsistencySupervisedLearning2020, title = {On the Consistency of Supervised Learning with Missing Values}, - author = {Josse, Julie and Prost, Nicolas and Scornet, Erwan and Varoquaux, Gaël}, + author = {Josse, Julie and Prost, Nicolas and Scornet, Erwan and Varoquaux, Ga{\"e}l}, year = {2020}, - number = {arXiv:1902.06931}, eprint = {1902.06931}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -2290,7 +2202,7 @@ @article{kadanBoundExpectedStock2020 @inproceedings{kadraWelltunedSimpleNets2021, title = {Well-Tuned Simple Nets Excel on Tabular Datasets}, - booktitle = {{{NeurIPS}} 2021}, + booktitle = {Advances in {{Neural Information Processing Systems}}}, author = {Kadra, Arlind and Lindauer, Marius and Hutter, Frank and Grabocka, Josif}, year = {2021}, volume = {34}, @@ -2298,7 +2210,7 @@ @inproceedings{kadraWelltunedSimpleNets2021 } @article{kaeckPriceImpactBid2022, - title = {Price Impact versus Bid–Ask Spreads in the Index Option Market}, + title = {Price Impact versus Bid\textendash Ask Spreads in the Index Option Market}, author = {Kaeck, Andreas and Van Kervel, Vincent and Seeger, Norman J.}, year = {2022}, journal = {Journal of Financial Markets}, @@ -2338,8 +2250,7 @@ @inproceedings{keLightGBMHighlyEfficient2017 author = {Ke, Guolin and Meng, Qi and Finley, Thomas and Wang, Taifeng and Chen, Wei and Ma, Weidong and Ye, Qiwei and Liu, Tie-Yan}, year = {2017}, volume = {30}, - publisher = {{Curran Associates, Inc.}}, - urldate = {2022-12-15} + publisher = {{Curran Associates, Inc.}} } @article{kellyCharacteristicsAreCovariances2019, @@ -2352,13 +2263,12 @@ @article{kellyCharacteristicsAreCovariances2019 doi = {10.1016/j.jfineco.2019.05.001} } -@article{keskarLargeBatchTrainingDeep2017, - title = {On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima}, +@inproceedings{keskarLargeBatchTrainingDeep2017, + title = {On Large-Batch Training for Deep Learning: {{Generalization}} Gap and Sharp Minima}, + booktitle = {5th International Conference on Learning Representations}, author = {Keskar, Nitish Shirish and Mudigere, Dheevatsa and Nocedal, Jorge and Smelyanskiy, Mikhail and Tang, Ping Tak Peter}, year = {2017}, - journal = {arXiv: 1609.04836}, - eprint = {1609.04836}, - archiveprefix = {arxiv} + address = {{Toulon}} } @article{khorramEndtoendCNNLSTM2021, @@ -2383,15 +2293,12 @@ @article{kichererSeamlesslyPortableApplications2012 urldate = {2021-02-20} } -@misc{kitaevReformerEfficientTransformer2020, +@inproceedings{kitaevReformerEfficientTransformer2020, title = {Reformer: The Efficient Transformer}, - author = {Kitaev, Nikita and Kaiser, Łukasz and Levskaya, Anselm}, + booktitle = {8th {{International Conference}} on {{Learning Representations}}}, + author = {Kitaev, Nikita and Kaiser, {\L}ukasz and Levskaya, Anselm}, year = {2020}, - number = {arXiv:2001.04451}, - eprint = {2001.04451}, - publisher = {{arXiv}}, - urldate = {2023-01-16}, - archiveprefix = {arxiv} + address = {{Addis Ababa}} } @misc{klingenbrunnTransformerImplementationTimeseries2021, @@ -2399,6 +2306,7 @@ @misc{klingenbrunnTransformerImplementationTimeseries2021 author = {Klingenbrunn, Natasha}, year = {2021}, journal = {MLearning.ai}, + url = 
{https://medium.com/mlearning-ai/transformer-implementation-for-time-series-forecasting-a9db2db5c820}, urldate = {2021-11-06} } @@ -2423,15 +2331,16 @@ @article{kraussDeepNeuralNetworks2017 @article{krogerKapitelOutlierDetection, title = {{Kapitel 6: outlier detection}}, - author = {Kröger, Peer and Zimek, Arthur} + author = {Kr{\"o}ger, Peer and Zimek, Arthur} } @article{kuhlHumanVsSupervised2020, title = {Human vs. Supervised Machine Learning: Who Learns Patterns Faster?}, - author = {Kühl, Niklas and Goutier, Marc and Baier, Lucas and Wolff, Clemens and Martin, Dominik}, + author = {K{\"u}hl, Niklas and Goutier, Marc and Baier, Lucas and Wolff, Clemens and Martin, Dominik}, year = {2020}, journal = {arXiv:2012.03661 [cs]}, eprint = {2012.03661}, + url = {http://arxiv.org/abs/2012.03661}, urldate = {2021-11-27}, archiveprefix = {arxiv} } @@ -2452,11 +2361,10 @@ @article{lambertonIntroductionStochasticCalculus @misc{lampleLargeMemoryLayers2019, title = {Large Memory Layers with Product Keys}, - author = {Lample, Guillaume and Sablayrolles, Alexandre and Ranzato, Marc'Aurelio and Denoyer, Ludovic and Jégou, Hervé}, + author = {Lample, Guillaume and Sablayrolles, Alexandre and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}}, year = {2019}, - number = {arXiv:1907.05242}, eprint = {1907.05242}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1907.05242}, urldate = {2023-01-16}, archiveprefix = {arxiv} } @@ -2464,13 +2372,13 @@ @misc{lampleLargeMemoryLayers2019 @incollection{lecunEfficientBackProp2012, title = {Efficient {{BackProp}}}, booktitle = {Neural {{Networks}}: {{Tricks}} of the {{Trade}}}, - author = {LeCun, Yann A. and Bottou, Léon and Orr, Genevieve B. and Müller, Klaus-Robert}, - editor = {Montavon, Grégoire and Orr, Geneviève B. and Müller, Klaus-Robert}, + author = {LeCun, Yann A. and Bottou, L{\'e}on and Orr, Genevieve B. and M{\"u}ller, Klaus-Robert}, + editor = {Montavon, Gr{\'e}goire and Orr, Genevi{\`e}ve B. and M{\"u}ller, Klaus-Robert}, year = {2012}, volume = {7700}, publisher = {{Springer Berlin Heidelberg}}, address = {{Berlin, Heidelberg}}, - doi = {10.1007/978-3-642-35289-8_3} + doi = {10.1007/978-3-642-35289-8\_3} } @article{leeInferringInvestorBehavior2000, @@ -2527,6 +2435,7 @@ @inproceedings{leeSetTransformerFramework2019 author = {Lee, Juho and Lee, Yoonho and Kim, Jungtaek and Kosiorek, Adam and Choi, Seungjin and Teh, Yee Whye}, year = {2019}, publisher = {{PMLR}}, + url = {https://proceedings.mlr.press/v97/lee19d.html}, urldate = {2023-04-20} } @@ -2540,11 +2449,12 @@ @article{leitchEconomicForecastEvaluation1991 eprint = {2006520}, eprinttype = {jstor}, publisher = {{American Economic Association}}, + url = {http://www.jstor.org/stable/2006520}, urldate = {2021-12-22} } @inproceedings{lemorvanWhatGoodImputation2021, - title = {What’s a Good Imputation to Predict with Missing Values?}, + title = {What's a Good Imputation to Predict with Missing Values?}, booktitle = {Advances in {{Neural Information Processing Systems}}}, author = {Le Morvan, Marine and Josse, Julie and Scornet, Erwan and Varoquaux, Gael}, year = {2021}, @@ -2556,9 +2466,7 @@ @misc{levinTransferLearningDeep2022 title = {Transfer Learning with Deep Tabular Models}, author = {Levin, Roman and Cherepanova, Valeriia and Schwarzschild, Avi and Bansal, Arpit and Bruss, C. 
Bayan and Goldstein, Tom and Wilson, Andrew Gordon and Goldblum, Micah}, year = {2022}, - number = {2206.15306}, eprint = {2206.15306}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -2609,9 +2517,8 @@ @misc{linSurveyTransformers2021 title = {A Survey of Transformers}, author = {Lin, Tianyang and Wang, Yuxin and Liu, Xiangyang and Qiu, Xipeng}, year = {2021}, - number = {arXiv:2106.04554}, eprint = {2106.04554}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2106.04554}, urldate = {2022-12-04}, archiveprefix = {arxiv} } @@ -2626,15 +2533,14 @@ @article{linWhyOptionsPrices2015 urldate = {2022-07-12} } -@misc{liptonMythosModelInterpretability2017, - title = {The Mythos of Model Interpretability}, +@article{liptonMythosModelInterpretability2017, + title = {The {{Mythos}} of {{Model Interpretability}}: {{In}} Machine Learning, the Concept of Interpretability Is Both Important and Slippery.}, author = {Lipton, Zachary C.}, - year = {2017}, - number = {arXiv:1606.03490}, - eprint = {1606.03490}, - publisher = {{arXiv}}, - urldate = {2023-02-21}, - archiveprefix = {arxiv} + year = {2018}, + journal = {Queue}, + volume = {16}, + number = {3}, + doi = {10.1145/3236386.3241340} } @article{littleStatisticalAnalysisMissing, @@ -2664,9 +2570,8 @@ @misc{liuMonolithRealTime2022 title = {Monolith: Real Time Recommendation System with Collisionless Embedding Table}, author = {Liu, Zhuoran and Zou, Leqi and Zou, Xuan and Wang, Caihua and Zhang, Biao and Tang, Da and Zhu, Bolin and Zhu, Yijie and Wu, Peng and Wang, Ke and Cheng, Youlong}, year = {2022}, - number = {arXiv:2209.07663}, eprint = {2209.07663}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2209.07663}, urldate = {2022-11-01}, archiveprefix = {arxiv} } @@ -2675,9 +2580,8 @@ @misc{liuPayAttentionMlps2021 title = {Pay Attention to Mlps}, author = {Liu, Hanxiao and Dai, Zihang and So, David R. 
and Le, Quoc V.}, year = {2021}, - number = {arXiv:2105.08050}, eprint = {2105.08050}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2105.08050}, urldate = {2022-12-07}, archiveprefix = {arxiv} } @@ -2706,9 +2610,8 @@ @misc{liuRoBERTaRobustlyOptimized2019 title = {Roberta: A Robustly Optimized Bert Pretraining Approach}, author = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin}, year = {2019}, - number = {arXiv:1907.11692}, eprint = {1907.11692}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1907.11692}, urldate = {2023-01-13}, archiveprefix = {arxiv} } @@ -2724,24 +2627,22 @@ @inproceedings{liuSTAMPShorttermAttention2018 urldate = {2021-05-04} } -@misc{liuUnderstandingDifficultyTraining2020, +@inproceedings{liuUnderstandingDifficultyTraining2020, title = {Understanding the Difficulty of Training Transformers}, + booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing ({{EMNLP}})}, author = {Liu, Liyuan and Liu, Xiaodong and Gao, Jianfeng and Chen, Weizhu and Han, Jiawei}, year = {2020}, - number = {arXiv:2004.08249}, - eprint = {2004.08249}, - publisher = {{arXiv}}, - urldate = {2023-01-14}, - archiveprefix = {arxiv} + publisher = {{Association for Computational Linguistics}}, + address = {{Online}}, + doi = {10.18653/v1/2020.emnlp-main.463} } @misc{liuVarianceAdaptiveLearning2021, title = {On the Variance of the Adaptive Learning Rate and Beyond}, author = {Liu, Liyuan and Jiang, Haoming and He, Pengcheng and Chen, Weizhu and Liu, Xiaodong and Gao, Jianfeng and Han, Jiawei}, year = {2021}, - number = {arXiv:1908.03265}, eprint = {1908.03265}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1908.03265}, urldate = {2023-01-09}, archiveprefix = {arxiv} } @@ -2761,39 +2662,33 @@ @misc{lonesHowAvoidMachine2022 title = {How to Avoid Machine Learning Pitfalls: A Guide for Academic Researchers}, author = {Lones, Michael A.}, year = {2022}, - number = {arXiv:2108.02497}, eprint = {2108.02497}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2108.02497}, urldate = {2022-11-06}, archiveprefix = {arxiv} } @book{lopezdepradoAdvancesFinancialMachine2018, title = {Advances in Financial Machine Learning}, - author = {{López de Prado}, Marcos}, + author = {{L{\'o}pez de Prado}, Marcos}, year = {2018}, publisher = {{Wiley}}, address = {{Hoboken, NJ}} } -@misc{loshchilovDecoupledWeightDecay2019, +@inproceedings{loshchilovDecoupledWeightDecay2019, title = {Decoupled Weight Decay Regularization}, + booktitle = {7th {{International Conference}} on {{Learning Representations}}}, author = {Loshchilov, Ilya and Hutter, Frank}, year = {2019}, - number = {arXiv:1711.05101}, - eprint = {1711.05101}, - publisher = {{arXiv}}, - urldate = {2023-01-09}, - archiveprefix = {arxiv} + address = {{New Orleans, LA, USA}} } @misc{loshchilovSGDRStochasticGradient2017, title = {{{SGDR}}: Stochastic Gradient Descent with Warm Restarts}, author = {Loshchilov, Ilya and Hutter, Frank}, year = {2017}, - number = {arXiv:1608.03983}, eprint = {1608.03983}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -2814,9 +2709,7 @@ @misc{lundbergConsistentIndividualizedFeature2019 title = {Consistent Individualized Feature Attribution for Tree Ensembles}, author = {Lundberg, Scott M. and Erion, Gabriel G. 
and Lee, Su-In}, year = {2019}, - number = {arXiv:1802.03888}, eprint = {1802.03888}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -2845,18 +2738,15 @@ @inproceedings{luoCollaborativeSelfattentionNetwork2020 @misc{MachineLearningHow, title = {Machine Learning - How to Intuitively Explain What a Kernel Is?}, journal = {Cross Validated}, - urldate = {2021-08-14}, - howpublished = {https://stats.stackexchange.com/questions/152897/how-to-intuitively-explain-what-a-kernel-is} + url = {https://stats.stackexchange.com/questions/152897/how-to-intuitively-explain-what-a-kernel-is}, + urldate = {2021-08-14} } @misc{malininUncertaintyGradientBoosting2021, title = {Uncertainty in Gradient Boosting via Ensembles}, author = {Malinin, Andrey and Prokhorenkova, Liudmila and Ustimenko, Aleksei}, year = {2021}, - number = {arXiv:2006.10562}, eprint = {2006.10562}, - publisher = {{arXiv}}, - urldate = {2022-12-03}, archiveprefix = {arxiv} } @@ -2889,7 +2779,7 @@ @article{mankowitzFasterSortingAlgorithms @article{maraisDeepLearningTabular, title = {Deep Learning for Tabular Data: An Exploratory Study}, - author = {Marais, Jan André} + author = {Marais, Jan Andr{\'e}} } @book{martinEconometricModellingTime2012, @@ -2953,6 +2843,16 @@ @article{mayhewCompetitionMarketStructure2002 urldate = {2023-06-22} } +@misc{mccoyBERTsFeatherNot2020, + title = {{{BERTs}} of a Feather Do Not Generalize Together: {{Large}} Variability in Generalization across Models with Similar Test Set Performance}, + author = {McCoy, R. Thomas and Min, Junghyun and Linzen, Tal}, + year = {2020}, + eprint = {1911.02969}, + url = {http://arxiv.org/abs/1911.02969}, + urldate = {2023-07-04}, + archiveprefix = {arxiv} +} + @article{mcnemarNoteSamplingError1947, title = {Note on the Sampling Error of the Difference between Correlated Proportions or Percentages}, author = {McNemar, Quinn}, @@ -2971,22 +2871,28 @@ @article{measeBoostedClassificationTrees @misc{melisStateArtEvaluation2017, title = {On the State of the Art of Evaluation in Neural Language Models}, - author = {Melis, Gábor and Dyer, Chris and Blunsom, Phil}, + author = {Melis, G{\'a}bor and Dyer, Chris and Blunsom, Phil}, year = {2017}, - number = {arXiv:1707.05589}, eprint = {1707.05589}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1707.05589}, urldate = {2022-10-26}, archiveprefix = {arxiv} } +@misc{merchantWhatHappensBERT2020, + title = {What {{Happens To BERT Embeddings During Fine-tuning}}?}, + author = {Merchant, Amil and Rahimtoroghi, Elahe and Pavlick, Ellie and Tenney, Ian}, + year = {2020}, + eprint = {2004.14448}, + archiveprefix = {arxiv} +} + @misc{meyesAblationStudiesArtificial2019, title = {Ablation Studies in Artificial Neural Networks}, author = {Meyes, Richard and Lu, Melanie and {de Puiseau}, Constantin Waubert and Meisen, Tobias}, year = {2019}, - number = {arXiv:1901.08644}, eprint = {1901.08644}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1901.08644}, urldate = {2022-12-15}, archiveprefix = {arxiv} } @@ -2997,8 +2903,7 @@ @inproceedings{michelAreSixteenHeads2019 author = {Michel, Paul and Levy, Omer and Neubig, Graham}, year = {2019}, volume = {32}, - publisher = {{Curran Associates, Inc.}}, - urldate = {2023-01-12} + publisher = {{Curran Associates, Inc.}} } @inproceedings{mikolovLinguisticRegularitiesContinuous2013, @@ -3007,8 +2912,7 @@ @inproceedings{mikolovLinguisticRegularitiesContinuous2013 author = {Mikolov, Tomas and Yih, Wen-tau and Zweig, Geoffrey}, year = {2013}, publisher = {{Association for Computational 
Linguistics}}, - address = {{Atlanta, Georgia}}, - urldate = {2023-01-10} + address = {{Atlanta, GA}} } @misc{mirzaeiHowUseDeepLearning2019, @@ -3016,6 +2920,7 @@ @misc{mirzaeiHowUseDeepLearning2019 author = {Mirzaei, Ali}, year = {2019}, journal = {Medium}, + url = {https://medium.com/@a.mirzaei69/how-to-use-deep-learning-for-feature-selection-python-keras-24a68bef1e33}, urldate = {2021-11-05} } @@ -3035,6 +2940,7 @@ @misc{modelerSHAPNotAll2023 author = {Modeler, Mindful}, year = {2023}, journal = {Mindful Modeler}, + url = {https://mindfulmodeler.substack.com/p/shap-is-not-all-you-need}, urldate = {2023-03-26} } @@ -3050,11 +2956,10 @@ @article{mogharStockMarketPrediction2020 @misc{molnarRelatingPartialDependence2021, title = {Relating the Partial Dependence Plot and Permutation Feature Importance to the Data Generating Process}, - author = {Molnar, Christoph and Freiesleben, Timo and König, Gunnar and Casalicchio, Giuseppe and Wright, Marvin N. and Bischl, Bernd}, + author = {Molnar, Christoph and Freiesleben, Timo and K{\"o}nig, Gunnar and Casalicchio, Giuseppe and Wright, Marvin N. and Bischl, Bernd}, year = {2021}, - number = {arXiv:2109.01433}, eprint = {2109.01433}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2109.01433}, urldate = {2023-02-08}, archiveprefix = {arxiv} } @@ -3067,8 +2972,7 @@ @article{muravyevOptionsTradingCosts2020 journal = {The Review of Financial Studies}, volume = {33}, number = {11}, - doi = {10.1093/rfs/hhaa010}, - urldate = {2023-02-25} + doi = {10.1093/rfs/hhaa010} } @article{muravyevOrderFlowExpected2016, @@ -3113,17 +3017,15 @@ @misc{narangTransformerModificationsTransfer2021 title = {Do Transformer Modifications Transfer across Implementations and Applications?}, author = {Narang, Sharan and Chung, Hyung Won and Tay, Yi and Fedus, William and Fevry, Thibault and Matena, Michael and Malkan, Karishma and Fiedel, Noah and Shazeer, Noam and Lan, Zhenzhong and Zhou, Yanqi and Li, Wei and Ding, Nan and Marcus, Jake and Roberts, Adam and Raffel, Colin}, year = {2021}, - number = {arXiv:2102.11972}, eprint = {2102.11972}, - publisher = {{arXiv}}, - urldate = {2023-01-16}, archiveprefix = {arxiv} } @misc{nasdaqincFrequentlyAskedQuestions2017, title = {Frequently Asked Questions {{ISE}} Open/Close Trade Profile {{GEMX}} Open/Close Trade Profile}, - author = {{Nasdaq Inc.}}, + author = {{NASDAQ Inc.}}, year = {2017}, + url = {https://www.nasdaqtrader.com/content/ProductsServices/DATAPRODUCTS/ISE/ISE-GEMX%20Consolidated%20Trade%20Profile%20FAQs%20v2.pdF}, urldate = {2023-03-03} } @@ -3136,26 +3038,26 @@ @incollection{nelsonMachineLearningStrategic2023 series = {Advanced {{Sciences}} and {{Technologies}} for {{Security Applications}}}, publisher = {{Springer International Publishing}}, address = {{Cham}}, - doi = {10.1007/978-3-031-20036-6_10}, + doi = {10.1007/978-3-031-20036-6\_10}, urldate = {2022-12-29} } @misc{NetflixUpdateTry, title = {Netflix Update: Try This at Home}, - urldate = {2021-04-20}, - howpublished = {https://sifter.org/\textasciitilde simon/journal/20061211.html} + url = {https://sifter.org/~simon/journal/20061211.html}, + urldate = {2021-04-20} } @incollection{neumannMotivatingSupportingUser2007, title = {Motivating and Supporting User Interaction with Recommender Systems}, booktitle = {Research and {{Advanced Technology}} for {{Digital Libraries}}}, author = {Neumann, Andreas W.}, - editor = {Kovács, László and Fuhr, Norbert and Meghini, Carlo}, + editor = {Kov{\'a}cs, L{\'a}szl{\'o} and Fuhr, Norbert and Meghini, Carlo}, year =
{2007}, volume = {4675}, publisher = {{Springer Berlin Heidelberg}}, address = {{Berlin, Heidelberg}}, - doi = {10.1007/978-3-540-74851-9_36}, + doi = {10.1007/978-3-540-74851-9\_36}, urldate = {2021-03-21} } @@ -3170,7 +3072,7 @@ @inproceedings{ngFeatureSelectionVs2004 urldate = {2023-01-28} } -@article{nguyenTransformersTearsImproving2019, +@misc{nguyenTransformersTearsImproving2019, title = {Transformers without Tears: Improving the Normalization of Self-Attention}, author = {Nguyen, Toan Q. and Salazar, Julian}, year = {2019}, @@ -3226,9 +3128,8 @@ @misc{noriAccuracyInterpretabilityDifferential2021 title = {Accuracy, Interpretability, and Differential Privacy via Explainable Boosting}, author = {Nori, Harsha and Caruana, Rich and Bu, Zhiqi and Shen, Judy Hanwen and Kulkarni, Janardhan}, year = {2021}, - number = {arXiv:2106.09680}, eprint = {2106.09680}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2106.09680}, urldate = {2022-12-03}, archiveprefix = {arxiv} } @@ -3252,7 +3153,7 @@ @incollection{nowakAccuracyTradeClassification2020 year = {2020}, publisher = {{Springer International Publishing}}, address = {{Cham}}, - doi = {10.1007/978-3-030-43078-8_6}, + doi = {10.1007/978-3-030-43078-8\_6}, urldate = {2023-02-01} } @@ -3261,14 +3162,14 @@ @misc{ntakourisTimeSeriesTransformer2021 author = {Ntakouris, Theodoros}, year = {2021}, journal = {Medium}, - urldate = {2021-11-06}, - howpublished = {https://towardsdatascience.com/the-time-series-transformer-2a521a0efad3} + url = {https://towardsdatascience.com/the-time-series-transformer-2a521a0efad3}, + urldate = {2021-11-06} } @misc{Ochama, title = {Ochama}, - urldate = {2023-02-17}, - howpublished = {https://www.ochama.com/cart/} + url = {https://www.ochama.com/cart/}, + urldate = {2023-02-17} } @article{odders-whiteOccurrenceConsequencesInaccurate2000, @@ -3283,21 +3184,20 @@ @article{odders-whiteOccurrenceConsequencesInaccurate2000 @article{olbrysEvaluatingTradeSide2018, title = {Evaluating Trade Side Classification Algorithms Using Intraday Data from the Warsaw Stock Exchange}, - author = {Olbrys, Joanna and Mursztyn, Michał}, + author = {Olbrys, Joanna and Mursztyn, Micha{\l}}, year = {2018}, publisher = {{Karlsruhe}}, doi = {10.5445/KSP/1000085951/20}, urldate = {2022-10-03}, - copyright = {Closed Access, Creative Commons Namensnennung – Weitergabe unter gleichen Bedingungen 4.0 International} + copyright = {Closed Access, Creative Commons Namensnennung \textendash{} Weitergabe unter gleichen Bedingungen 4.0 International} } @misc{oliverRealisticEvaluationDeep2019, title = {Realistic Evaluation of Deep Semi-Supervised Learning Algorithms}, author = {Oliver, Avital and Odena, Augustus and Raffel, Colin and Cubuk, Ekin D. 
and Goodfellow, Ian J.}, year = {2019}, - number = {arXiv:1804.09170}, eprint = {1804.09170}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1804.09170}, urldate = {2022-10-30}, archiveprefix = {arxiv} } @@ -3305,8 +3205,8 @@ @misc{oliverRealisticEvaluationDeep2019 @misc{OptionTrades, title = {Option Trades}, journal = {Cboe DataShop}, - urldate = {2022-11-08}, - howpublished = {https://datashop.cboe.com/option-trades} + url = {https://datashop.cboe.com/option-trades}, + urldate = {2022-11-08} } @book{owenHyperparameterTuningPython2022, @@ -3347,10 +3247,9 @@ @article{panInformationOptionVolume2006 @inproceedings{parmarImageTransformer2018, title = {Image Transformer}, booktitle = {Proceedings of the 35th {{International Conference}} on {{Machine Learning}}}, - author = {Parmar, Niki and Vaswani, Ashish and Uszkoreit, Jakob and Kaiser, Łukasz and Shazeer, Noam and Ku, Alexander and Tran, Dustin}, + author = {Parmar, Niki and Vaswani, Ashish and Uszkoreit, Jakob and Kaiser, {\L}ukasz and Shazeer, Noam and Ku, Alexander and Tran, Dustin}, year = {2018}, - publisher = {{PMLR}}, - urldate = {2023-01-18} + publisher = {{PMLR}} } @inproceedings{paszkePyTorchImperativeStyle2019, @@ -3368,27 +3267,24 @@ @misc{patrignaniWhyShouldAnyone2021 title = {Why Should Anyone Use Colours? Or, Syntax Highlighting beyond Code Snippets}, author = {Patrignani, Marco}, year = {2021}, - number = {arXiv:2001.11334}, eprint = {2001.11334}, - publisher = {{arXiv}}, urldate = {2022-12-21}, archiveprefix = {arxiv} } @misc{pedregosaScikitlearnMachineLearning2018, title = {Scikit-Learn: Machine Learning in Python}, - author = {Pedregosa, Fabian and Varoquaux, Gaël and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Müller, Andreas and Nothman, Joel and Louppe, Gilles and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, Édouard}, + author = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and M{\"u}ller, Andreas and Nothman, Joel and Louppe, Gilles and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and Vanderplas, Jake and Passos, Alexandre and Cournapeau, David and Brucher, Matthieu and Perrot, Matthieu and Duchesnay, {\'E}douard}, year = {2018}, - number = {arXiv:1201.0490}, eprint = {1201.0490}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1201.0490}, urldate = {2022-10-13}, archiveprefix = {arxiv} } @article{perez-lebelBenchmarkingMissingvaluesApproaches2022, title = {Benchmarking Missing-Values Approaches for Predictive Models on Health Databases}, - author = {{Perez-Lebel}, Alexandre and Varoquaux, Gaël and Le~Morvan, Marine and Josse, Julie and Poline, Jean-Baptiste}, + author = {{Perez-Lebel}, Alexandre and Varoquaux, Ga{\"e}l and Le~Morvan, Marine and Josse, Julie and Poline, Jean-Baptiste}, year = {2022}, journal = {GigaScience}, volume = {11}, @@ -3431,6 +3327,16 @@ @article{petersonEvaluationBiasesExecution2003 doi = {10.1016/S1386-4181(02)00065-4} } +@misc{petersTuneNotTune2019, + title = {To {{Tune}} or {{Not}} to {{Tune}}? {{Adapting Pretrained Representations}} to {{Diverse Tasks}}}, + author = {Peters, Matthew E. 
and Ruder, Sebastian and Smith, Noah A.}, + year = {2019}, + eprint = {1903.05987}, + url = {http://arxiv.org/abs/1903.05987}, + urldate = {2023-07-04}, + archiveprefix = {arxiv} +} + @incollection{petitUNetTransformerSelf2021, title = {U-Net Transformer: Self and Cross Attention for Medical Image Segmentation}, booktitle = {Machine {{Learning}} in {{Medical Imaging}}}, @@ -3440,7 +3346,7 @@ @incollection{petitUNetTransformerSelf2021 volume = {12966}, publisher = {{Springer International Publishing}}, address = {{Cham}}, - doi = {10.1007/978-3-030-87589-3_28}, + doi = {10.1007/978-3-030-87589-3\_28}, urldate = {2023-01-20} } @@ -3448,26 +3354,21 @@ @misc{phuongFormalAlgorithmsTransformers2022 title = {Formal Algorithms for Transformers}, author = {Phuong, Mary and Hutter, Marcus}, year = {2022}, - number = {arXiv:2207.09238}, eprint = {2207.09238}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @article{Piwowar_2006, - title = {The Sensitivity of Effective Spread Estimates to Trade–Quote Matching Algorithms}, + title = {The Sensitivity of Effective Spread Estimates to Trade\textendash Quote Matching Algorithms}, author = {Piwowar, Michael S. and Wei, Li}, year = {2006}, journal = {Electronic Markets}, - doi = {10.1080/10196780600643803}, - mag_id = {2013480621}, - pmcid = {null}, - pmid = {null} + doi = {10.1080/10196780600643803} } @article{popelTrainingTipsTransformer2018, title = {Training Tips for the Transformer Model}, - author = {Popel, Martin and Bojar, Ondřej}, + author = {Popel, Martin and Bojar, Ond{\v r}ej}, year = {2018}, journal = {The Prague Bulletin of Mathematical Linguistics}, volume = {110}, @@ -3482,16 +3383,15 @@ @misc{popovNeuralObliviousDecision2019 title = {Neural {{Oblivious Decision Ensembles}} for {{Deep Learning}} on {{Tabular Data}}}, author = {Popov, Sergei and Morozov, Stanislav and Babenko, Artem}, year = {2019}, - number = {arXiv:1909.06312}, eprint = {1909.06312}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1909.06312}, urldate = {2023-05-15}, archiveprefix = {arxiv} } @article{poppeSensitivityVPINChoice2016, title = {The Sensitivity of Vpin to the Choice of Trade Classification Algorithm}, - author = {Pöppe, Thomas and Moos, Sebastian and Schiereck, Dirk}, + author = {P{\"o}ppe, Thomas and Moos, Sebastian and Schiereck, Dirk}, year = {2016}, journal = {Journal of Banking \& Finance}, volume = {73}, @@ -3526,9 +3426,8 @@ @misc{powerGrokkingGeneralizationOverfitting2022 title = {Grokking: Generalization beyond Overfitting on Small Algorithmic Datasets}, author = {Power, Alethea and Burda, Yuri and Edwards, Harri and Babuschkin, Igor and Misra, Vedant}, year = {2022}, - number = {arXiv:2201.02177}, eprint = {2201.02177}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2201.02177}, urldate = {2023-03-05}, archiveprefix = {arxiv} } @@ -3537,9 +3436,8 @@ @misc{pressImprovingTransformerModels2020 title = {Improving Transformer Models by Reordering Their Sublayers}, author = {Press, Ofir and Smith, Noah A. 
and Levy, Omer}, year = {2020}, - number = {arXiv:1911.03864}, eprint = {1911.03864}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1911.03864}, urldate = {2023-01-10}, archiveprefix = {arxiv} } @@ -3557,7 +3455,7 @@ @inproceedings{prokhorenkovaCatBoostUnbiasedBoosting2018 @article{prollochsNegationScopeDetection2020, title = {Negation Scope Detection for Sentiment Analysis: A Reinforcement Learning Framework for Replicating Human Interpretations}, - author = {Pröllochs, Nicolas and Feuerriegel, Stefan and Lutz, Bernhard and Neumann, Dirk}, + author = {Pr{\"o}llochs, Nicolas and Feuerriegel, Stefan and Lutz, Bernhard and Neumann, Dirk}, year = {2020}, journal = {Information Sciences}, volume = {536}, @@ -3574,9 +3472,8 @@ @misc{pulugundlaAttentionbasedNeuralBeamforming2021 title = {Attention-Based Neural Beamforming Layers for Multi-Channel Speech Recognition}, author = {Pulugundla, Bhargav and Gao, Yang and King, Brian and Keskin, Gokce and Mallidi, Harish and Wu, Minhua and Droppo, Jasha and Maas, Roland}, year = {2021}, - number = {arXiv:2105.05920}, eprint = {2105.05920}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2105.05920}, urldate = {2023-01-16}, archiveprefix = {arxiv} } @@ -3590,23 +3487,17 @@ @misc{raeScalingLanguageModels2022 title = {Scaling {{Language Models}}: {{Methods}}, {{Analysis}} \& {{Insights}} from {{Training Gopher}}}, author = {Rae, Jack W. and Borgeaud, Sebastian and Cai, Trevor and Millican, Katie and Hoffmann, Jordan and Song, Francis and Aslanides, John and Henderson, Sarah and Ring, Roman and Young, Susannah and Rutherford, Eliza and Hennigan, Tom and Menick, Jacob and Cassirer, Albin and Powell, Richard and Driessche, George van den and Hendricks, Lisa Anne and Rauh, Maribeth and Huang, Po-Sen and Glaese, Amelia and Welbl, Johannes and Dathathri, Sumanth and Huang, Saffron and Uesato, Jonathan and Mellor, John and Higgins, Irina and Creswell, Antonia and McAleese, Nat and Wu, Amy and Elsen, Erich and Jayakumar, Siddhant and Buchatskaya, Elena and Budden, David and Sutherland, Esme and Simonyan, Karen and Paganini, Michela and Sifre, Laurent and Martens, Lena and Li, Xiang Lorraine and Kuncoro, Adhiguna and Nematzadeh, Aida and Gribovskaya, Elena and Donato, Domenic and Lazaridou, Angeliki and Mensch, Arthur and Lespiau, Jean-Baptiste and Tsimpoukelli, Maria and Grigorev, Nikolai and Fritz, Doug and Sottiaux, Thibault and Pajarskas, Mantas and Pohlen, Toby and Gong, Zhitao and Toyama, Daniel and {d'Autume}, Cyprien de Masson and Li, Yujia and Terzi, Tayfun and Mikulik, Vladimir and Babuschkin, Igor and Clark, Aidan and Casas, Diego de Las and Guy, Aurelia and Jones, Chris and Bradbury, James and Johnson, Matthew and Hechtman, Blake and Weidinger, Laura and Gabriel, Iason and Isaac, William and Lockhart, Ed and Osindero, Simon and Rimell, Laura and Dyer, Chris and Vinyals, Oriol and Ayoub, Kareem and Stanway, Jeff and Bennett, Lorrayne and Hassabis, Demis and Kavukcuoglu, Koray and Irving, Geoffrey}, year = {2022}, - number = {arXiv:2112.11446}, eprint = {2112.11446}, - publisher = {{arXiv}}, - urldate = {2023-06-17}, archiveprefix = {arxiv} } -@misc{raffelExploringLimitsTransfer2020, +@article{raffelExploringLimitsTransfer2020, title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}, author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.}, year = {2020}, - number = {arXiv:1910.10683}, - eprint = 
{1910.10683}, - publisher = {{arXiv}}, - doi = {10.48550/arXiv.1910.10683}, - urldate = {2023-01-18}, - archiveprefix = {arxiv} + journal = {Journal of Machine Learning Research}, + volume = {21}, + number = {140} } @article{raschkaIntroductionLatestTechniques2021, @@ -3627,9 +3518,8 @@ @misc{raschkaModelEvaluationModel2020 title = {Model Evaluation, Model Selection, and Algorithm Selection in Machine Learning}, author = {Raschka, Sebastian}, year = {2020}, - number = {arXiv:1811.12808}, eprint = {1811.12808}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1811.12808}, urldate = {2023-01-16}, archiveprefix = {arxiv} } @@ -3642,8 +3532,8 @@ @article{raschkaRecentTrendsTechnologies2021 @misc{RecipeTrainingNeural, title = {A Recipe for Training Neural Networks}, - urldate = {2021-12-03}, - howpublished = {http://karpathy.github.io/2019/04/25/recipe/} + url = {http://karpathy.github.io/2019/04/25/recipe/}, + urldate = {2021-12-03} } @article{ribeiroEnsembleApproachBased2020, @@ -3661,20 +3551,22 @@ @article{rogersPrimerBERTologyWhat2020 year = {2020}, journal = {Transactions of the Association for Computational Linguistics}, volume = {8}, - doi = {10.1162/tacl_a_00349}, + doi = {10.1162/tacl\_a\_00349}, urldate = {2023-06-17} } @misc{ronenMachineLearningTrade2022, title = {Machine Learning and Trade Direction Classification: Insights from the Corporate Bond Market}, - author = {Ronen, Tavy and Fedenia, Mark A. and Nam, Seunghan}, + author = {Fedenia, Mark A. and Ronen, Tavy and Nam, Seunghan}, year = {2022}, - doi = {10.2139/ssrn.4213313} + eprint = {ssrn.4213313}, + doi = {10.2139/ssrn.4213313}, + archiveprefix = {SSRN} } @article{rosenthalModelingTradeDirection2012, title = {Modeling Trade Direction}, - author = {Rosenthal, D. W. R.}, + author = {Rosenthal, Dale W. 
R.}, year = {2012}, journal = {Journal of Financial Econometrics}, volume = {10}, @@ -3699,11 +3591,10 @@ @book{rothmanTransformersNaturalLanguage2021 @misc{rozemberczkiShapleyValueMachine2022, title = {The Shapley Value in Machine Learning}, - author = {Rozemberczki, Benedek and Watson, Lauren and Bayer, Péter and Yang, Hao-Tsung and Kiss, Olivér and Nilsson, Sebastian and Sarkar, Rik}, + author = {Rozemberczki, Benedek and Watson, Lauren and Bayer, P{\'e}ter and Yang, Hao-Tsung and Kiss, Oliv{\'e}r and Nilsson, Sebastian and Sarkar, Rik}, year = {2022}, - number = {arXiv:2202.05594}, eprint = {2202.05594}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2202.05594}, urldate = {2022-10-27}, archiveprefix = {arxiv} } @@ -3712,10 +3603,7 @@ @misc{rubachevRevisitingPretrainingObjectives2022 title = {Revisiting Pretraining Objectives for Tabular Deep Learning}, author = {Rubachev, Ivan and Alekberov, Artem and Gorishniy, Yury and Babenko, Artem}, year = {2022}, - number = {arXiv:2207.03208}, eprint = {2207.03208}, - publisher = {{arXiv}}, - urldate = {2022-12-09}, archiveprefix = {arxiv} } @@ -3726,9 +3614,6 @@ @article{rubinBayesianBootstrap1981 journal = {The Annals of Statistics}, volume = {9}, number = {1}, - eprint = {2240875}, - eprinttype = {jstor}, - publisher = {{Institute of Mathematical Statistics}}, doi = {10.1214/aos/1176345338}, urldate = {2023-06-30} } @@ -3828,6 +3713,8 @@ @techreport{securitiesandexchangecommissionReportConcerningExaminations2007 title = {Report {{Concerning Examinations}} of {{Options Order Routing}} and {{Execution}}}, author = {{Securities and Exchange Commission}}, year = {2007}, + institution = {{Securities and Exchange Commission}}, + url = {https://www.sec.gov/files/optionsroutingreport.pdf}, urldate = {2023-06-26} } @@ -3849,7 +3736,7 @@ @article{shallueMeasuringEffectsData @article{shaniEvaluatingRecommendationSystems, title = {Evaluating Recommendation Systems}, author = {Shani, Guy and Gunawardana, Asela}, - doi = {10.1007/978-0-387-85820-3_8} + doi = {10.1007/978-0-387-85820-3\_8} } @incollection{shapley17ValueNPerson1953, @@ -3859,8 +3746,7 @@ @incollection{shapley17ValueNPerson1953 editor = {Kuhn, Harold William and Tucker, Albert William}, year = {1953}, publisher = {{Princeton University Press}}, - doi = {10.1515/9781400881970-018}, - urldate = {2023-06-22} + doi = {10.1515/9781400881970-018} } @inproceedings{shavittRegularizationLearningNetworks2018, @@ -3868,27 +3754,24 @@ @inproceedings{shavittRegularizationLearningNetworks2018 booktitle = {32nd {{Conference}} on {{Neural Information Processing Systems}}}, author = {Shavitt, Ira and Segal, Eran}, year = {2018}, - address = {{Montréal}} + address = {{Montr\'eal}} } @misc{shazeerAdafactorAdaptiveLearning2018, title = {Adafactor: Adaptive Learning Rates with Sublinear Memory Cost}, author = {Shazeer, Noam and Stern, Mitchell}, year = {2018}, - number = {arXiv:1804.04235}, eprint = {1804.04235}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1804.04235}, urldate = {2023-01-19}, archiveprefix = {arxiv} } @misc{shazeerGLUVariantsImprove2020, - title = {{{GLU}} Variants Improve Transformer}, + title = {{{GLU Variants Improve Transformer}}}, author = {Shazeer, Noam}, year = {2020}, - number = {arXiv:2002.05202}, eprint = {2002.05202}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -3912,9 +3795,8 @@ @misc{shwartz-zivTabularDataDeep2021 title = {Tabular Data: Deep Learning Is Not All You Need}, author = {{Shwartz-Ziv}, Ravid and Armon, Amitai}, year = {2021}, - number = 
{arXiv:2106.03253}, eprint = {2106.03253}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2106.03253}, urldate = {2022-10-05}, archiveprefix = {arxiv} } @@ -3922,20 +3804,19 @@ @misc{shwartz-zivTabularDataDeep2021 @inproceedings{smiejaProcessingMissingData2018, title = {Processing of Missing Data by Neural Networks}, booktitle = {Advances in {{Neural Information Processing Systems}}}, - author = {Śmieja, Marek and Struski, Łukasz and Tabor, Jacek and Zieliński, Bartosz and Spurek, Przemysław}, + author = {{\'S}mieja, Marek and Struski, {\L}ukasz and Tabor, Jacek and Zieli{\'n}ski, Bartosz and Spurek, Przemys{\l}aw}, year = {2018}, volume = {31}, publisher = {{Curran Associates, Inc.}}, + url = {https://proceedings.neurips.cc/paper/2018/hash/411ae1bf081d1674ca6091f8c59a266f-Abstract.html}, urldate = {2022-11-28} } @misc{smilkovSmoothGradRemovingNoise2017, title = {{{SmoothGrad}}: Removing Noise by Adding Noise}, - author = {Smilkov, Daniel and Thorat, Nikhil and Kim, Been and Viégas, Fernanda and Wattenberg, Martin}, + author = {Smilkov, Daniel and Thorat, Nikhil and Kim, Been and Vi{\'e}gas, Fernanda and Wattenberg, Martin}, year = {2017}, - number = {arXiv:1706.03825}, eprint = {1706.03825}, - publisher = {{arXiv}}, doi = {10.48550/arXiv.1706.03825}, urldate = {2022-12-17}, archiveprefix = {arxiv} @@ -3945,9 +3826,8 @@ @misc{smithAperiodicMonotile2023 title = {An Aperiodic Monotile}, author = {Smith, David and Myers, Joseph Samuel and Kaplan, Craig S. and {Goodman-Strauss}, Chaim}, year = {2023}, - number = {arXiv:2303.10798}, eprint = {2303.10798}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2303.10798}, urldate = {2023-05-15}, archiveprefix = {arxiv} } @@ -3956,9 +3836,7 @@ @misc{smithCyclicalLearningRates2017 title = {Cyclical Learning Rates for Training Neural Networks}, author = {Smith, Leslie N.}, year = {2017}, - number = {arXiv:1506.01186}, eprint = {1506.01186}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -3966,9 +3844,8 @@ @misc{smithDonDecayLearning2018 title = {Don't Decay the Learning Rate, Increase the Batch Size}, author = {Smith, Samuel L. and Kindermans, Pieter-Jan and Ying, Chris and Le, Quoc V.}, year = {2018}, - number = {arXiv:1711.00489}, eprint = {1711.00489}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/1711.00489}, urldate = {2022-12-20}, archiveprefix = {arxiv} } @@ -3980,14 +3857,14 @@ @inproceedings{snoekPracticalBayesianOptimization2012 year = {2012}, volume = {25}, publisher = {{Curran Associates, Inc.}}, + url = {https://papers.nips.cc/paper/2012/hash/05311655a15b75fab86956663e1819cd-Abstract.html}, urldate = {2022-11-01} } @misc{somepalliSaintImprovedNeural2021, - title = {Saint: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training}, + title = {{{SAINT}}: Improved Neural Networks for Tabular Data via Row Attention and Contrastive Pre-Training}, author = {Somepalli, Gowthami and Goldblum, Micah and Schwarzschild, Avi and Bruss, C. 
Bayan and Goldstein, Tom}, year = {2021}, - number = {arXiv:2106.01342}, eprint = {2106.01342}, urldate = {2022-10-04}, archiveprefix = {arxiv} @@ -4000,7 +3877,6 @@ @inproceedings{songAutoIntAutomaticFeature2019 year = {2019}, eprint = {1810.11921}, doi = {10.1145/3357384.3357925}, - urldate = {2023-01-25}, archiveprefix = {arxiv} } @@ -4008,6 +3884,7 @@ @misc{SparseAutoencodersUsing2020 title = {Sparse Autoencoders Using L1 Regularization with {{PyTorch}}}, year = {2020}, journal = {DebuggerCafe}, + url = {https://debuggercafe.com/sparse-autoencoders-using-l1-regularization-with-pytorch/}, urldate = {2021-11-15} } @@ -4024,6 +3901,7 @@ @misc{statquestwithjoshstarmerGradientBoostPart2019 title = {Gradient Boost Part 2 (of 4): Regression Details}, author = {{StatQuest with Josh Starmer}}, year = {2019}, + url = {https://www.youtube.com/watch?v=2xudPOBz-vs}, urldate = {2021-12-25} } @@ -4065,46 +3943,40 @@ @misc{sukhbaatarAugmentingSelfattentionPersistent2019 title = {Augmenting Self-Attention with Persistent Memory}, author = {Sukhbaatar, Sainbayar and Grave, Edouard and Lample, Guillaume and Jegou, Herve and Joulin, Armand}, year = {2019}, - number = {arXiv:1907.01470}, eprint = {1907.01470}, - publisher = {{arXiv}}, - urldate = {2023-01-17}, archiveprefix = {arxiv} } @incollection{sunAdaBoostLSTMEnsembleLearning2018, title = {{{AdaBoost-LSTM}} Ensemble Learning for Financial Time Series Forecasting}, - booktitle = {Computational {{Science}} – {{ICCS}} 2018}, + booktitle = {Computational {{Science}} \textendash{} {{ICCS}} 2018}, author = {Sun, Shaolong and Wei, Yunjie and Wang, Shouyang}, editor = {Shi, Yong and Fu, Haohuan and Tian, Yingjie and Krzhizhanovskaya, Valeria V. and Lees, Michael Harold and Dongarra, Jack and Sloot, Peter M. A.}, year = {2018}, volume = {10862}, publisher = {{Springer International Publishing}}, address = {{Cham}}, - doi = {10.1007/978-3-319-93713-7_55} + doi = {10.1007/978-3-319-93713-7\_55} } @misc{suRoFormerEnhancedTransformer2022, title = {{{RoFormer}}: Enhanced Transformer with Rotary Position Embedding}, author = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng}, year = {2022}, - number = {arXiv:2104.09864}, eprint = {2104.09864}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2104.09864}, urldate = {2023-04-14}, archiveprefix = {arxiv} } -@misc{sutskeverSequenceSequenceLearning2014, +@inproceedings{sutskeverSequenceSequenceLearning2014, title = {Sequence to Sequence Learning with Neural Networks}, + booktitle = {Proceedings of the 27th {{International Conference}} on {{Neural Information Processing Systems}}}, author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V.}, year = {2014}, - number = {arXiv:1409.3215}, - eprint = {1409.3215}, - publisher = {{arXiv}}, - doi = {10.48550/arXiv.1409.3215}, - urldate = {2023-01-30}, - archiveprefix = {arxiv} + volume = {2}, + publisher = {{MIT Press}}, + address = {{Montreal, QC, Canada}} } @inproceedings{szegedyRethinkingInceptionArchitecture2016, @@ -4122,9 +3994,8 @@ @misc{takaseLayerNormalizationsResidual2022 title = {On Layer Normalizations and Residual Connections in Transformers}, author = {Takase, Sho and Kiyono, Shun and Kobayashi, Sosuke and Suzuki, Jun}, year = {2022}, - number = {arXiv:2206.00330}, eprint = {2206.00330}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2206.00330}, urldate = {2023-01-19}, archiveprefix = {arxiv} } @@ -4154,10 +4025,7 @@ @misc{tayEfficientTransformersSurvey2022 title = {Efficient Transformers: A Survey}, author = {Tay,
Yi and Dehghani, Mostafa and Bahri, Dara and Metzler, Donald}, year = {2022}, - number = {arXiv:2009.06732}, eprint = {2009.06732}, - publisher = {{arXiv}}, - urldate = {2022-12-05}, archiveprefix = {arxiv} } @@ -4165,9 +4033,8 @@ @misc{taylorGalacticaLargeLanguage2022 title = {Galactica: A Large Language Model for Science}, author = {Taylor, Ross and Kardas, Marcin and Cucurull, Guillem and Scialom, Thomas and Hartshorn, Anthony and Saravia, Elvis and Poulton, Andrew and Kerkez, Viktor and Stojnic, Robert}, year = {2022}, - number = {arXiv:2211.09085}, eprint = {2211.09085}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2211.09085}, urldate = {2023-05-03}, archiveprefix = {arxiv} } @@ -4197,9 +4064,8 @@ @misc{thoppilanLaMDALanguageModels2022 title = {{{LaMDA}}: Language Models for Dialog Applications}, author = {Thoppilan, Romal and De Freitas, Daniel and Hall, Jamie and Shazeer, Noam and Kulshreshtha, Apoorv and Cheng, Heng-Tze and Jin, Alicia and Bos, Taylor and Baker, Leslie and Du, Yu and Li, YaGuang and Lee, Hongrae and Zheng, Huaixiu Steven and Ghafouri, Amin and Menegali, Marcelo and Huang, Yanping and Krikun, Maxim and Lepikhin, Dmitry and Qin, James and Chen, Dehao and Xu, Yuanzhong and Chen, Zhifeng and Roberts, Adam and Bosma, Maarten and Zhao, Vincent and Zhou, Yanqi and Chang, Chung-Ching and Krivokon, Igor and Rusch, Will and Pickett, Marc and Srinivasan, Pranesh and Man, Laichee and {Meier-Hellstern}, Kathleen and Morris, Meredith Ringel and Doshi, Tulsee and Santos, Renelito Delos and Duke, Toju and Soraker, Johnny and Zevenbergen, Ben and Prabhakaran, Vinodkumar and Diaz, Mark and Hutchinson, Ben and Olson, Kristen and Molina, Alejandra and {Hoffman-John}, Erin and Lee, Josh and Aroyo, Lora and Rajakumar, Ravi and Butryna, Alena and Lamm, Matthew and Kuzmina, Viktoriya and Fenton, Joe and Cohen, Aaron and Bernstein, Rachel and Kurzweil, Ray and {Aguera-Arcas}, Blaise and Cui, Claire and Croak, Marian and Chi, Ed and Le, Quoc}, year = {2022}, - number = {arXiv:2201.08239}, eprint = {2201.08239}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2201.08239}, urldate = {2023-01-31}, archiveprefix = {arxiv} } @@ -4212,25 +4078,24 @@ @article{tobekDoesSourceFundamental @misc{touvronLLaMAOpenEfficient2023, title = {{{LLaMA}}: Open and Efficient Foundation Language Models}, - author = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timothée and Rozière, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume}, + author = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume}, year = {2023}, - number = {arXiv:2302.13971}, eprint = {2302.13971}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2302.13971}, urldate = {2023-04-29}, archiveprefix = {arxiv} } @misc{TransformerArchitecturePositional, title = {Transformer Architecture: The Positional Encoding - Amirhossein Kazemnejad's Blog}, - urldate = {2021-12-28}, - howpublished = {https://kazemnejad.com/blog/transformer\_architecture\_positional\_encoding/} + url = {https://kazemnejad.com/blog/transformer_architecture_positional_encoding/}, + urldate = {2021-12-28} } @misc{TransformersLucasBeyer, title = {Transformers with Lucas 
Beyer, Google Brain - {{YouTube}}}, - urldate = {2022-10-27}, - howpublished = {https://www.youtube.com/watch?v=EixI6t5oif0} + url = {https://www.youtube.com/watch?v=EixI6t5oif0}, + urldate = {2022-10-27} } @article{tsaiPredictingStockReturns2011, @@ -4247,8 +4112,8 @@ @misc{tuningplaybookgithub title = {Deep {{Learning Tuning Playbook}}}, author = {Godbole, Varun and Dahl, George E. and Gilmer, Justin and Shallue, Christopher J. and Nado, Zachary}, year = {2023}, - urldate = {2023-06-01}, - howpublished = {http://github.com/google-research/tuning\_playbook} + url = {http://github.com/google-research/tuning_playbook}, + urldate = {2023-06-01} } @article{tunstallNaturalLanguageProcessing2022, @@ -4261,9 +4126,7 @@ @misc{turnerBayesianOptimizationSuperior2021 title = {Bayesian Optimization Is Superior to Random Search for Machine Learning Hyperparameter Tuning: Analysis of the Black-Box Optimization Challenge 2020}, author = {Turner, Ryan and Eriksson, David and McCourt, Michael and Kiili, Juha and Laaksonen, Eero and Xu, Zhen and Guyon, Isabelle}, year = {2021}, - number = {arXiv:2104.10201}, eprint = {2104.10201}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -4284,6 +4147,7 @@ @inproceedings{ucarSubTabSubsettingFeatures2021 year = {2021}, volume = {34}, publisher = {{Curran Associates, Inc.}}, + url = {https://proceedings.neurips.cc/paper/2021/hash/9c8661befae6dbcd08304dbf4dcaf0db-Abstract.html}, urldate = {2022-10-27} } @@ -4321,9 +4185,8 @@ @misc{vasuImprovedOneMillisecond2022 title = {An Improved One Millisecond Mobile Backbone}, author = {Vasu, Pavan Kumar Anasosalu and Gabriel, James and Zhu, Jeff and Tuzel, Oncel and Ranjan, Anurag}, year = {2022}, - number = {arXiv:2206.04040}, eprint = {2206.04040}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2206.04040}, urldate = {2022-07-28}, archiveprefix = {arxiv} } @@ -4331,7 +4194,7 @@ @misc{vasuImprovedOneMillisecond2022 @inproceedings{vaswaniAttentionAllYou2017, title = {Attention Is All You Need}, booktitle = {Proceedings of the 32nd {{International Conference}} on {{Neural Information Processing Systems}}}, - author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Łukasz and Polosukhin, Illia}, + author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia}, year = {2017}, series = {{{NeurIPS}} 2017}, volume = {30}, @@ -4357,6 +4220,7 @@ @inproceedings{vigInvestigatingGenderBias2020 year = {2020}, volume = {33}, publisher = {{Curran Associates, Inc.}}, + url = {https://proceedings.neurips.cc/paper/2020/hash/92650b2e92217715fe312e6fa7b90d82-Abstract.html}, urldate = {2023-01-29} } @@ -4423,9 +4287,8 @@ @misc{wangLinformerSelfattentionLinear2020 title = {Linformer: Self-Attention with Linear Complexity}, author = {Wang, Sinong and Li, Belinda Z. 
and Khabsa, Madian and Fang, Han and Ma, Hao}, year = {2020}, - number = {arXiv:2006.04768}, eprint = {2006.04768}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2006.04768}, urldate = {2022-12-10}, archiveprefix = {arxiv} } @@ -4434,21 +4297,21 @@ @incollection{wangPerceivingNextChoice2017 title = {Perceiving the next Choice with Comprehensive Transaction Embeddings for Online Recommendation}, booktitle = {Machine {{Learning}} and {{Knowledge Discovery}} in {{Databases}}}, author = {Wang, Shoujin and Hu, Liang and Cao, Longbing}, - editor = {Ceci, Michelangelo and Hollmén, Jaakko and Todorovski, Ljupčo and Vens, Celine and Džeroski, Sašo}, + editor = {Ceci, Michelangelo and Hollm{\'e}n, Jaakko and Todorovski, Ljup{\v c}o and Vens, Celine and D{\v z}eroski, Sa{\v s}o}, year = {2017}, volume = {10535}, publisher = {{Springer International Publishing}}, address = {{Cham}}, - doi = {10.1007/978-3-319-71246-8_18}, + doi = {10.1007/978-3-319-71246-8\_18}, urldate = {2021-05-04} } -@article{wangSurveySessionbasedRecommender2020, +@misc{wangSurveySessionbasedRecommender2020, title = {A Survey on Session-Based Recommender Systems}, author = {Wang, Shoujin and Cao, Longbing and Wang, Yan and Sheng, Quan Z. and Orgun, Mehmet and Lian, Defu}, year = {2020}, - journal = {arXiv:1902.04864 [cs]}, eprint = {1902.04864}, + url = {http://arxiv.org/abs/1902.04864}, urldate = {2021-04-22}, archiveprefix = {arxiv} } @@ -4462,15 +4325,14 @@ @misc{wangWizMapScalableInteractive2023 title = {{{WizMap}}: {{Scalable Interactive Visualization}} for {{Exploring Large Machine Learning Embeddings}}}, author = {Wang, Zijie J. and Hohman, Fred and Chau, Duen Horng}, year = {2023}, - number = {arXiv:2306.09328}, eprint = {2306.09328}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2306.09328}, urldate = {2023-06-26}, archiveprefix = {arxiv} } @article{waszczukAssemblingInternationalEquity2014, - title = {Assembling International Equity Datasets – Review of Studies on the Cross-Section of Returns}, + title = {Assembling International Equity Datasets \textendash{} Review of Studies on the Cross-Section of Returns}, author = {Waszczuk, Antonina}, year = {2014}, journal = {Procedia Economics and Finance}, @@ -4483,9 +4345,8 @@ @misc{weiTheoreticalAnalysisSelfTraining2022 title = {Theoretical Analysis of Self-Training with Deep Networks on Unlabeled Data}, author = {Wei, Colin and Shen, Kendrick and Chen, Yining and Ma, Tengyu}, year = {2022}, - number = {arXiv:2010.03622}, eprint = {2010.03622}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2010.03622}, urldate = {2023-03-31}, archiveprefix = {arxiv} } @@ -4504,9 +4365,9 @@ @misc{wengLearningNotEnough2021 title = {Learning with Not Enough Data Part 1: Semi-Supervised Learning}, author = {Weng, Lilian}, year = {2021}, + url = {https://lilianweng.github.io/posts/2021-12-05-semi-supervised/}, urldate = {2022-10-13}, - chapter = {posts}, - howpublished = {https://lilianweng.github.io/posts/2021-12-05-semi-supervised/} + chapter = {posts} } @misc{WhenMachinesTrade, @@ -4515,37 +4376,34 @@ @misc{WhenMachinesTrade urldate = {2023-05-01} } -@misc{wiegreffeAttentionNotNot2019, - title = {Attention Is Not Not Explanation}, +@inproceedings{wiegreffeAttentionNotNot2019, + title = {Attention Is Not Not {{Explanation}}}, + booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing ({{EMNLP-IJCNLP}})}, author = {Wiegreffe, Sarah and Pinter, Yuval}, 
year = {2019}, - number = {arXiv:1908.04626}, - eprint = {1908.04626}, - publisher = {{arXiv}}, - urldate = {2023-01-08}, - archiveprefix = {arxiv} + publisher = {{Association for Computational Linguistics}}, + address = {{Hong Kong, China}}, + doi = {10.18653/v1/D19-1002} } @misc{wuMemorizingTransformers2022, title = {Memorizing Transformers}, author = {Wu, Yuhuai and Rabe, Markus N. and Hutchins, DeLesley and Szegedy, Christian}, year = {2022}, - number = {arXiv:2203.08913}, eprint = {2203.08913}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2203.08913}, urldate = {2023-01-17}, archiveprefix = {arxiv} } -@misc{xiongLayerNormalizationTransformer2020, +@inproceedings{xiongLayerNormalizationTransformer2020, title = {On Layer Normalization in the Transformer Architecture}, + booktitle = {Proceedings of the 37th International Conference on Machine Learning}, author = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tie-Yan}, year = {2020}, - number = {arXiv:2002.04745}, - eprint = {2002.04745}, - publisher = {{arXiv}}, - urldate = {2022-12-30}, - archiveprefix = {arxiv} + volume = {21}, + publisher = {{PMLR}}, + address = {{Online}} } @article{yangStockPricePrediction2021, @@ -4574,9 +4432,7 @@ @misc{yaoZeroQuantEfficientAffordable2022 title = {{{ZeroQuant}}: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers}, author = {Yao, Zhewei and Aminabadi, Reza Yazdani and Zhang, Minjia and Wu, Xiaoxia and Li, Conglong and He, Yuxiong}, year = {2022}, - number = {arXiv:2206.01861}, eprint = {2206.01861}, - publisher = {{arXiv}}, doi = {10.48550/arXiv.2206.01861}, urldate = {2022-11-23}, archiveprefix = {arxiv} @@ -4624,9 +4480,8 @@ @misc{zengAreTransformersEffective2022 title = {Are Transformers Effective for Time Series Forecasting?}, author = {Zeng, Ailing and Chen, Muxi and Zhang, Lei and Xu, Qiang}, year = {2022}, - number = {arXiv:2205.13504}, eprint = {2205.13504}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2205.13504}, urldate = {2022-12-24}, archiveprefix = {arxiv} } @@ -4636,13 +4491,10 @@ @article{zhaiDirectBoostingApproach author = {Zhai, Shaodan and Xia, Tian and Li, Zhongliang and Wang, Shaojun} } -@article{zhangDiveDeepLearning2021, +@misc{zhangDiveDeepLearning2021, title = {Dive into Deep Learning}, author = {Zhang, Aston and Lipton, Zachary C and Li, Mu and Smola, Alexander J}, - year = {2021}, - journal = {arXiv:2106.11342}, - eprint = {2106.11342}, - archiveprefix = {arxiv} + year = {2021} } @article{zhangUptodateComparisonStateoftheart2017, @@ -4664,9 +4516,7 @@ @misc{zhuClusteringStructureMicrostructure2021 title = {Clustering Structure of Microstructure Measures}, author = {Zhu, Liao and Sun, Ningning and Wells, Martin T.}, year = {2021}, - number = {arXiv:2107.02283}, eprint = {2107.02283}, - publisher = {{arXiv}}, archiveprefix = {arxiv} } @@ -4681,17 +4531,15 @@ @inproceedings{zophRethinkingPretrainingSelftraining2020 author = {Zoph, Barret and Ghiasi, Golnaz and Lin, Tsung-Yi and Cui, Yin and Liu, Hanxiao and Cubuk, Ekin Dogus and Le, Quoc}, year = {2020}, volume = {33}, - publisher = {{Curran Associates, Inc.}}, - urldate = {2023-03-26} + publisher = {{Curran Associates, Inc.}} } @misc{zouStockMarketPrediction2022, title = {Stock Market Prediction via Deep Learning Techniques: A Survey}, author = {Zou, Jinan and Zhao, Qingying and Jiao, Yang and Cao, Haiyao and Liu, Yanxi and Yan, Qingsen and Abbasnejad, Ehsan and Liu, 
Lingqiao and Shi, Javen Qinfeng}, year = {2022}, - number = {arXiv:2212.12717}, eprint = {2212.12717}, - publisher = {{arXiv}}, + url = {http://arxiv.org/abs/2212.12717}, urldate = {2022-12-29}, archiveprefix = {arxiv} } diff --git a/reports/Content/data-preprocessing.tex b/reports/Content/data-preprocessing.tex index 36b2e307..529cd073 100644 --- a/reports/Content/data-preprocessing.tex +++ b/reports/Content/data-preprocessing.tex @@ -1,4 +1,4 @@ -\addtocontents{toc}{\protect\newpage} +% \addtocontents{toc}{\protect\newpage} \section{Empirical Study}\label{sec:empirical-study} In this section, we demonstrate the efficacy of machine learning for trade classification in an empirical setting. We begin by outlining the dataset construction. @@ -13,10 +13,10 @@ \subsubsection{Data Collection}\label{sec:data-collection} Testing the empirical accuracy of our approaches requires option trades where the true initiator is known. To arrive at a labeled sample, we combine data from four individual data sources. Our primary source is LiveVol, which records option trades executed at US option exchanges at a transaction level. We limit our focus to option trades executed at the \gls{CBOE} and \gls{ISE}. LiveVol contains both trade and matching quote data. Like most proprietary data sources, it does not distinguish the initiator nor does it include the involved trader types. For the \gls{CBOE} and \gls{ISE} exchange, the \gls{ISE} Open/Close Trade Profile and \gls{CBOE} Open-Close Volume Summary contain the buy and sell volumes for the option series by trader type aggregated on a daily level. A combination of the LiveVol dataset with the open/close data, allows us to infer the trade initiator for a subset of trades. For evaluation and use in some of our machine learning models, we acquire additional underlying and option characteristics from IvyDB's OptionMetrics. -In \cref{sec:trade-initiator} we discussed three views on the trade initiator. Due to the absence of order entry times or order types in our data sources, we define the trade initiator based on the position relative to the market maker, who caters to the liquidity demand. Specifically, we classify customer trades as buyer-initiated if the trade is due to a customer buy order and as seller-initiated for customer sales. As previous literature, e.g., \textcite[][4276]{garleanuDemandBasedOptionPricing2009} suggests that trader types, for example, proprietary traders, have a similar role to market makers by supplying liquidity, we limit our analysis to trades between customers and market makers for which the picture is unambiguous. Our definition is consistent with the of \textcite[][8]{grauerOptionTradeClassification2022}. +In \cref{sec:trade-initiator} we discussed three views on the trade initiator. Due to the absence of order entry times or order types in our data sources, we define the trade initiator based on the position relative to the market maker, who caters to the liquidity demand. Specifically, we classify customer trades as buyer-initiated if the trade is due to a customer buy order and as seller-initiated for customer sales. As previous literature, e.g., \textcite[\checkmark][4276]{garleanuDemandBasedOptionPricing2009} suggests that trader types, for example, proprietary traders, have a similar role to market makers by supplying liquidity, we limit our analysis to trades between customers and market makers for which the picture is unambiguous. 
Our definition is consistent with \textcite[\checkmark][8]{grauerOptionTradeClassification2022}. -Our sample construction follows \textcite[][7--9]{grauerOptionTradeClassification2022}, fostering comparability between both works. We acquire transaction-level options trade data for all major US exchanges from LiveVol. The dataset is tabular, and each record is time-stamped to the second. For each transaction, the executing exchange, trade price, trade volume, quotes and quote sizes for the exchanges where the option is quoted, as well as the \gls{NBBO} are recorded. This is sufficient to estimate the quote rule, depth rule, and trade size rule. In addition, for tick-based algorithms, we add the previous and subsequent distinguishable trade prices. We can uniquely identify the traded option series from a distinct key consisting of the underlying, expiration date, option type and strike price. Our analysis is conducted on transactions at the \gls{ISE} and \gls{CBOE}. To purge the data of potential errors, we filter out: +Our sample construction follows \textcite[\checkmark][7--9]{grauerOptionTradeClassification2022}, fostering comparability between both works. We acquire transaction-level options trade data for all major US exchanges from LiveVol. The dataset is tabular and each record is time-stamped to the second. For each transaction, the executing exchange, trade price, trade volume, quotes, and quote sizes for the exchanges where the option is quoted, as well as the \gls{NBBO} are recorded. This is sufficient to estimate the quote rule, depth rule, and trade size rule. In addition, for tick-based algorithms, we add the previous and subsequent distinguishable trade prices. We can uniquely identify the traded option series from a distinct key consisting of the underlying, expiration date, option type, and strike price. Our analysis is conducted on transactions at the \gls{ISE} and \gls{CBOE}. To purge the data of potential errors, we filter out: \begin{enumerate}[label=(\roman*),noitemsep] \item trades with a trade price $\leq \SI{0}[\$]{}$, \item trades with a trade volume $\leq 0$ or $\ge \num{10000000}$ contracts, @@ -25,19 +25,19 @@ \subsubsection{Data Collection}\label{sec:data-collection} \end{enumerate} -The open/close datasets for the \gls{ISE} and \gls{CBOE} contain the daily buy and sell volumes for the option series by trader type, the trade volume and whether a position was closed or opened. Four trader types are available: customer, professional customer, broker/dealer, and firm proprietary. Customer orders are placed by a retail trader or a member of the exchange on behalf of the customer. Professional customers are distinguished from the former by a high trading activity ($\geq390$ orders per day over one month period). Likewise, trades by a member are classified as proprietary, if executed for their account or broker/dealer if placed for non-members of the exchange \autocite[][2]{nasdaqincFrequentlyAskedQuestions2017}. Trades of customers and professional customers are detailed by trade volume ($\leq 100$; 101--199; $> 199$ contracts). As well as, if a position is newly opened or closed. We first sum buy and sell orders of all trader types and volumes to obtain the daily trading volumes at the \gls{ISE} or \gls{CBOE} per option series and day. Separately for the customer buy and sell volumes, we calculate the daily aggregates identified by the account type customer. 
+The open/close datasets for the \gls{ISE} and \gls{CBOE} contain the daily buy and sell volumes for the option series by trader type, the trade volume, and whether a position was closed or opened. Four trader types are available: customer, professional customer, broker/dealer, and firm proprietary. Customer orders are placed by a retail trader or a member of the exchange on behalf of the customer. Professional customers are distinguished from the former by a high trading activity ($\geq390$ orders per day over a one-month period). Likewise, trades by a member are classified as proprietary if executed for their own account, or as broker/dealer if placed for non-members of the exchange \autocite[\checkmark][2]{nasdaqincFrequentlyAskedQuestions2017}. Trades of customers and professional customers are further detailed by trade volume ($\leq 100$; 101--199; $> 199$ contracts) and by whether a position is newly opened or closed. We first sum buy and sell orders of all trader types and volumes to obtain the daily trading volumes at the \gls{ISE} or \gls{CBOE} per option series and day. Separately for the customer buy and sell volumes, we calculate the daily aggregates identified by the account type customer.

To infer the true label, we exploit that, if there were only customer buy or sell orders, that is, the customer buy or sell volume equals the daily trading volume, we can confidently sign all transactions for the option series at the specific date and exchange as either buyer- or seller-initiated. Our labeling approach fails in the presence of non-customer or simultaneous customer buy and sell trades. The so-obtained trade initiator is merged with the LiveVol trades of the exchange based on the unique key for the option series. For the \gls{ISE} trades, our matched sample spans from 2 May 2005 to 31 May 2017 and includes \num{49203747} trades. The period covers the full history of \gls{ISE} open/close data up to the last date the dataset was available to us. Our matched \gls{CBOE} sample consists of \num{37155412} trades between 1 January 2011 and 31 October 2017. The sample period is governed by a paradigm shift in the construction of the \gls{CBOE} open/close dataset and the most recent trade in our LiveVol subscription.

-Following our initial rationale to explore semi-supervised methods, we reserve unlabeled trades between 24 October 2012 and 24 October 2013 at the \gls{ISE} for pre- and self-training. We provide further details in \cref{sec:train-test-split}. Since LiveVol doesn't distinguish by trader types, this dataset includes both customer and non-customer trades, as well as simultanous buy and sell trades on the same day. Within this period, we filter out trades for which the true label can be inferred to avoid overlap with the supervised dataset. This is crucial for self-training, where labeled and unlabeled data are presented to the model simultaneously.
+Following our initial rationale to explore semi-supervised methods, we reserve unlabeled trades between 24 October 2012 and 24 October 2013 at the \gls{ISE} for pre- and self-training. We provide further details in \cref{sec:train-test-split}. Since LiveVol doesn't distinguish by trader types, this dataset includes both customer and non-customer trades, as well as simultaneous buy and sell trades on the same day. Within this period, we filter out trades for which the true label can be inferred to avoid overlap with the supervised dataset. This is crucial for self-training, where labeled and unlabeled data are presented to the model simultaneously.
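To make the labeling rule concrete, a minimal pandas sketch of the aggregation and merge described above follows. It is illustrative only: the column names (option_key, date, volume, cust_buy_volume, cust_sell_volume, buy_sell) are placeholder assumptions and do not reproduce the actual LiveVol or open/close schemas.

import pandas as pd

def label_customer_trades(trades: pd.DataFrame, open_close: pd.DataFrame) -> pd.DataFrame:
    """Sign trades on days where customer volume alone explains the daily trading volume."""
    daily = open_close.groupby(["option_key", "date"], as_index=False).agg(
        total_vol=("volume", "sum"),
        cust_buy=("cust_buy_volume", "sum"),
        cust_sell=("cust_sell_volume", "sum"),
    )
    # +1 if only customer buys occurred, -1 if only customer sells occurred, NA otherwise.
    daily["buy_sell"] = pd.NA
    daily.loc[daily["cust_buy"] == daily["total_vol"], "buy_sell"] = 1
    daily.loc[daily["cust_sell"] == daily["total_vol"], "buy_sell"] = -1
    # Attach the inferred initiator to the transaction-level data; unmatched trades stay unlabeled.
    return trades.merge(
        daily[["option_key", "date", "buy_sell"]], on=["option_key", "date"], how="left"
    )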
While our procedure makes the inference of the true trade initiator partly feasible, concerns regarding a selection bias due to the excessive filtering have to be raised. We address these concerns and report summary statistics for unmerged and merged sub-samples in \cref{app:summary-statistics}. In the following chapter, we motivate feature engineering, present our feature sets, and discuss strategies for transforming features into a form that accelerates the training of our models.

\subsubsection{Data Preprocessing}\label{sec:data-preprocessing}

-Classical algorithms infer the initiator of the trade from the \emph{raw} price and quote data. We employ feature engineering to pre-process input data and enhance the convergence and performance of our machine-learning models. Gradient-boosted trees and neural networks, though, flexible estimators have limitations in synthesizing new features from existing ones, as demonstrated in empirical work on synthetic data by \textcite[][5--6]{heatonEmpiricalAnalysisFeature2016}. Specifically, ratios, standard deviations, and differences can be difficult for these models to learn and must therefore be engineered beforehand.
+Classical algorithms infer the initiator of the trade from the \emph{raw} price and quote data. We employ feature engineering to pre-process input data and enhance the convergence and performance of our machine-learning models. Gradient-boosted trees and neural networks, though flexible estimators, have limitations in synthesizing new features from existing ones, as demonstrated in empirical work on synthetic data by \textcite[\checkmark][4--6]{heatonEmpiricalAnalysisFeature2016}. Specifically, ratios, standard deviations, and differences can be difficult for these models to learn and must therefore be engineered beforehand.

\textbf{Features and Feature Sets}

@@ -99,28 +99,20 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing}

\end{ThreePartTable}

-% Issue Type = the type of security:
-% - 0 = Common Stock
-% - A = Market index
-% - 7 = Mutual or investment trust fund
-% - F = ADR/ADS
-% - % = Exchange-traded fund
-% - (blank) = Unspecified
-
Features and feature sets are documented in \cref{tab:feature-sets}.

-We aid the models by estimating the change in trade price between the previous and successive distinguishable trades. This is identical to the criterion in the (reverse) tick rule, but in a non-quantized fashion to enforce a richer decision boundary and to surpass hard cut-off points. Similarly, the proximity of the trade price to the quotes, which is the decisive criterion in the quote rule and hybrids' thereof is added. The feature value ranges from $\left(-\infty,\infty\right)$ and is $-1$ for trades at the bid, 0 for trades at the mid, and 1 for trades at the ask. Quotes and trade prices are also incorporated as-is.
+We aid the models by estimating the change in trade price between the previous and successive distinguishable trades. This is identical to the criterion in the (reverse) tick rule, but in a non-quantized fashion to enforce a richer decision boundary and to surpass hard cut-off points. Similarly, the proximity of the trade price to the quotes, which is the decisive criterion in the quote rule and hybrids thereof, is added. The feature value lies in $\left(-\infty,\infty\right)$ and is $-1$ for trades at the bid, 0 for trades at the mid, and 1 for trades at the ask. Quotes and trade prices are also incorporated as-is.
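To illustrate these two engineered features, a small sketch follows. The column names (TRADE_PRICE, price_ex_lag, price_ex_lead, ask_ex, bid_ex) are assumed for this example and may deviate from the actual schema.

import numpy as np
import pandas as pd

def add_price_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # Unquantized (reverse) tick criterion: signed change w.r.t. the previous/subsequent trade price.
    out["chg_lag"] = out["TRADE_PRICE"] - out["price_ex_lag"]
    out["chg_lead"] = out["TRADE_PRICE"] - out["price_ex_lead"]
    # Proximity to quotes: -1 at the bid, 0 at the midpoint, 1 at the ask.
    mid = 0.5 * (out["ask_ex"] + out["bid_ex"])
    half_spread = (0.5 * (out["ask_ex"] - out["bid_ex"])).replace(0.0, np.nan)
    out["prox_ex"] = (out["TRADE_PRICE"] - mid) / half_spread  # locked quotes yield NaN and are imputed later
    return out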
-Our second feature set, named \gls{FS} size, extends the first feature set by the trade size and quoted sizes, required to estimate hybrid rules involving the depth rule and trade size rule. Both rules achieve state-of-the-art performance on option trades when paired with hybrid algorithms and are thus a viable source of features. We model the depth rule as the ratio between ask and bid sizes and the trade size rule as the ratio between the size of the trade and the quoted bid and ask sizes. Since features are not discretised, we obtain a generic formulation of the trade size rule, where part of the quoted size can remain unfilled. This potentially helps to distinguish limit orders from market orders. The trade price and midspread required for the depth rule are already encompassed in the first feature set. More generically, trade size is known to strongly affect classification. For instance, \textcites[][889]{savickasInferringDirectionOption2003}[][537]{ellisAccuracyTradeClassification2000} report that better classification is associated with smaller trades, as smaller trades are more likely to be executed at the quotes. By providing the model with the trade and quoted sizes we hope to make these nuances learnable. +Our second feature set, named \gls{FS} size, extends the first feature set by the trade size and quoted sizes, required to estimate hybrid rules involving the depth rule and trade size rule. Both rules achieve state-of-the-art performance on option trades when paired with hybrid algorithms and are thus a viable source of features. We model the depth rule as the ratio between ask and bid sizes and the trade size rule as the ratio between the size of the trade and the quoted bid and ask sizes. Since features are not discretized, we obtain a generic formulation of the trade size rule, where part of the quoted size can remain unfilled. This potentially helps to distinguish limit orders from market orders. The trade price and midspread required for the depth rule are already encompassed in the first feature set. More generically, trade size is known to strongly affect classification. For instance, \textcites[\checkmark][889]{savickasInferringDirectionOption2003}[\checkmark][537]{ellisAccuracyTradeClassification2000} report that better classification is associated with smaller trades, as smaller trades are more likely to be executed at the quotes. By providing the model with the trade and quoted sizes we hope to make these nuances learnable. -Our largest feature set, abbreviated with \gls{FS} option, also incorporates option characteristics, including the option type among others. By providing unique identifiers for the option series, we can potentially establish connections between transactions when trade initiators divide a single order into sub-orders or rely on complex trades. Similar reasoning applies to the daily volume of the option series. Option features are also informative individually. Time to maturity $\tau_{i,t}$, estimated in months, indirectly affects classification performance. On \gls{CBOE} data in \textcite[][889]{savickasInferringDirectionOption2003}, trades with longer maturities are smaller, hence more likely to be classified correctly. Moreover, time-of-maturity can be used as a dummy to identify rollovers \autocite[][700]{muravyevOrderFlowExpected2016}. When investors are short in call or put options, they replace expiring for non-expiring options, which creates selling pressure in the non-expiring option. The feature could make the procedure learnable. 
Related to the time-to-maturity is moneyness, estimated as the ratio between the price of the underlying $S_{i,t}$ and the strike price $K_{i}$ for calls and the reciprocal for puts. As moneyness is linked to leverage in the investment, we reason that incentives to initiate a trade might vary between buyers and sellers. The classification of index options poses challenges for traditional approaches relative to other security types (cp. \textcites[][22]{grauerOptionTradeClassification2022}[][898-899]{savickasInferringDirectionOption2003}), we equip the models with the security type, as well as the option type and root to extend the learnable context.
+Our largest feature set, abbreviated with \gls{FS} option, also incorporates option characteristics, including the option type among others. By providing unique identifiers for the option series, we can potentially establish connections between transactions when trade initiators divide a single order into sub-orders or rely on complex trades. Similar reasoning applies to the daily volume of the option series. Option features are also informative individually. Time to maturity $\tau_{i,t}$, estimated in months, indirectly affects classification performance. On \gls{CBOE} data in \textcite[\checkmark][889]{savickasInferringDirectionOption2003}, trades with longer maturities are smaller, hence more likely to be classified correctly. Moreover, time-of-maturity can be used as a dummy to identify rollovers \autocite[\checkmark][700]{muravyevOrderFlowExpected2016}. When investors are short in call or put options, they replace expiring for non-expiring options, which creates selling pressure in the non-expiring option. The feature could make the procedure learnable. Related to the time-to-maturity is moneyness, estimated as the ratio between the price of the underlying $S_{i,t}$ and the strike price $K_{i}$ for calls and the reciprocal for puts. As moneyness is linked to leverage in the investment, we reason that incentives to initiate a trade might vary between buyers and sellers. Since the classification of index options poses challenges for traditional approaches relative to other security types (cp. \textcites[\checkmark][22]{grauerOptionTradeClassification2022}[\checkmark][898-899]{savickasInferringDirectionOption2003}), we equip the models with the security type, as well as the option type and root, to extend the learnable context.

-Arguably, our models have simultaneous access to the previous and successive trade prices and quotes for both the exchange and the \gls{NBBO}, which is an advantage over base rules. As we benchmark against various, stacked hybrid rules, the data requirements are still comparable. We emphasize this aspect, as it is neglected in previous works \autocites[][485]{blazejewskiLocalNonParametricModel2005}[][48]{ronenMachineLearningTrade2022}[][9]{rosenthalModelingTradeDirection2012}.
+Arguably, our models have simultaneous access to the previous and successive trade prices and quotes for both the exchange and the \gls{NBBO}, which is an advantage over base rules. As we benchmark against various stacked hybrid rules, the data requirements are still comparable. We emphasize this aspect, as it is neglected in previous works \autocites[\checkmark][485]{blazejewskiLocalNonParametricModel2005}[\checkmark][48]{ronenMachineLearningTrade2022}[\checkmark][398]{rosenthalModelingTradeDirection2012}.

\textbf{Numerical Features}

-Pricing or quote data can often not be fully reconstructed, resulting in missing values across all features.
Decision trees and ensembles thereof can inherently handle $\mathtt{[NaN]}$ values by discarding missing values in the splitting procedure \autocite[][150--152]{breimanClassificationRegressionTrees2017} or by incorporating missing values into the splitting criterion \autocite[][951]{twalaGoodMethodsCoping2008}. Transformers require missing values to be imputed beforehand, as a $\mathtt{[NaN]}$ value cannot be propagated through the network. We choose zero imputation for being a single-pass strategy that minimizes data leakage and allows \glspl{GBRT} and neural networks to separate imputed values from observed ones. With a low degree of missing values, the impact on the final result is minuscule. +Pricing or quote data can often not be fully reconstructed, resulting in missing values across all features. Decision trees and ensembles thereof can inherently handle $\mathtt{[NaN]}$ values by discarding missing values in the splitting procedure \autocite[\checkmark][150--152]{breimanClassificationRegressionTrees2017} or by incorporating missing values into the splitting criterion \autocite[\checkmark][951]{twalaGoodMethodsCoping2008}. Transformers require missing values to be imputed beforehand, as a $\mathtt{[NaN]}$ value cannot be propagated through the network. We choose zero imputation for being a single-pass strategy that minimizes data leakage and allows \glspl{GBRT} and neural networks to separate imputed values from observed ones. With a low degree of missing values, the impact on the final result is minuscule. -Price and size-related features exhibit positive skewness. Tree-based learners are unaffected by the feature scale, as the splitting process is based on the quality of the split but not on the scale of splitting value (cp. \cref{sec:decision-tree}). To avoid the tails of the distribution dominating the weight updates of neural networks, we apply power transformations, which transform the distribution of features to be Gaussian-like. Apart from quantization effects, \glspl{GBRT} are unaffected. We determine the power transformation using the Box-Cox procedure \autocite[][214]{boxAnalysisTransformations2022}, given by: +Price and size-related features exhibit positive skewness. Tree-based learners are unaffected by the feature scale, as the splitting process is based on the quality of the split but not on the scale of splitting value (cp. \cref{sec:decision-tree}). To avoid the tails of the distribution dominating the weight updates of neural networks, we apply power transformations, which transform the distribution of features to be Gaussian-like. Apart from quantization effects, \glspl{GBRT} are unaffected. We determine the power transformation using the Box-Cox procedure \autocite[\checkmark][214]{boxAnalysisTransformations2022}, given by: \begin{equation} \mathbf{X}^{*}\left[:,j\right]= \begin{cases}\frac{1}{\lambda}(\mathbf{X}\left[:,j\right]^\lambda-1), & \lambda \neq 0 \\ \log (\mathbf{X}\left[:,j\right]),& \lambda=0\end{cases}. @@ -132,7 +124,7 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing} In experimental tests, features derived as ratios, such as the proximity to quotes, pose a particular challenge for training the FT-Transformer. We observe extreme outliers dominate the gradient update, leading to unstable gradients and poor convergence. We resolve the issue by clipping to a range $[-3,3]$. -To further improve the convergence of Transformers, we normalize all numerical features using $z$-score normalization to obtain zero mean and unit variance. 
Intuitionally, the zero means prevents bias in the direction of the weight update and scaling to unit variance balances the rate at which parameters are updated \autocite[][8]{lecunEfficientBackProp2012}. Normalization of raw inputs is complementary to batch normalization, which is used in deeper layers of the Transformer stack and single batches. Following good standards, all statistics are estimated on the imputed training set only. The unlabeled \gls{ISE} training set and the \gls{CBOE} test set share the statistics of the \gls{ISE} labeled training set.
+To further improve the convergence of Transformers, we normalize all numerical features using $z$-score normalization to obtain zero mean and unit variance. Intuitively, the zero mean prevents bias in the direction of the weight update, and scaling to unit variance balances the rate at which parameters are updated \autocite[\checkmark][16--17]{lecunEfficientBackProp2012}. Normalization of raw inputs is complementary to batch normalization, which is used in deeper layers of the Transformer stack and single batches. Following good standards, all statistics are estimated on the imputed training set only. The unlabeled \gls{ISE} training set and the \gls{CBOE} test set share the statistics of the \gls{ISE} labeled training set.

Normalization and log transformations have the advantage of preserving the data distribution, which is a desirable property when comparing the feature importances from machine learning models against their classical counterparts in \cref{sec:feature-importance}.
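The numerical preprocessing described above can be sketched in a few lines. The snippet is a minimal illustration on synthetic, strictly positive data standing in for the price- and size-related columns; it glosses over imputation and the exact column-wise treatment used in the study.

import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(42)
# Stand-ins for positively skewed price/size columns of the training and test sets.
X_train = rng.lognormal(mean=0.0, sigma=1.0, size=(1_000, 3))
X_test = rng.lognormal(mean=0.0, sigma=1.0, size=(200, 3))

# Box-Cox power transform followed by z-score scaling (standardize=True);
# statistics are estimated on the training set only.
pt = PowerTransformer(method="box-cox", standardize=True)
X_train_t = pt.fit_transform(X_train)
X_test_t = pt.transform(X_test)  # validation, unlabeled, and CBOE sets reuse the same statistics

# Ratio-type features such as the proximity to quotes are additionally clipped to [-3, 3]
# so that extreme outliers do not dominate the gradient updates.
X_train_t = np.clip(X_train_t, -3.0, 3.0)
X_test_t = np.clip(X_test_t, -3.0, 3.0)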
@@ -142,19 +134,19 @@ \subsubsection{Data Preprocessing}\label{sec:data-preprocessing}

The option type and issue type are both low-cardinal with two and five unique classes. Differently, the underlying is high-cardinal with more than \num{9107} distinct classes, as options are written on a wide range of underlyings, impacting both the model's tendency to overfit and parameter count. For simplicity in evaluation, we do not remove infrequent categories.

-Disadvantages of label encoding, as raised in \textcite[][12]{hancockSurveyCategoricalData2020}, such as the unequal contributions of larger keys to the loss in neural networks or the artificially implied order, do not apply here, as the conversion is followed by sophisticated treatments within the models.
+Disadvantages of label encoding, as raised in \textcite[\checkmark][12]{hancockSurveyCategoricalData2020}, such as the unequal contributions of larger keys to the loss in neural networks or the artificially implied order, do not apply here, as the conversion is followed by sophisticated treatments within the models.

A comprehensive overview of all feature transformations is given in \cref{app:feature-and-transformations}. The next section discusses the train-test split.

\subsubsection{Train-Test Split}\label{sec:train-test-split}

-Prior classical works assess the performance of classical rules in-sample \autocite[cp.][541]{ellisAccuracyTradeClassification2000} or in an out-of-sample setting \autocites[cp.][7--9]{grauerOptionTradeClassification2022}[][3814--3815]{chakrabartyTradeClassificationAlgorithms2007}. In the presence of tunable hyperparameters in our classifiers, we separate the \gls{ISE} dataset into three disjoint sets. The training set is used to fit the classifier to the data. The validation set is dedicated to tuning the hyperparameters, and the test set is used for unbiased out-of-sample estimates.
+Prior classical works assess the performance of classical rules in-sample \autocite[cp.][541]{ellisAccuracyTradeClassification2000} or in an out-of-sample setting \autocites[cp.][9]{grauerOptionTradeClassification2022}[\checkmark][3814--3815]{chakrabartyTradeClassificationAlgorithms2007}. In the presence of tunable hyperparameters in our classifiers, we separate the \gls{ISE} dataset into three disjoint sets. The training set is used to fit the classifier to the data. The validation set is dedicated to tuning the hyperparameters, and the test set is used for unbiased out-of-sample estimates.

Trades in the dataset are ordered by time of execution, and nearby trades can be auto-correlated, as documented in \cref{app:autocorrelation-of-features}.

-Prime examples for auto-correlation between trades are market or limit orders, that are split into smaller orders for eager order execution. % Also, informed traders disguise their trading activity by slicing orders into smaller-sized orders, as reported by \textcite[][183]{anandStealthTradingOptions2007}.
-The resulting, separate transactions are trivial to classify with the true label of a single transaction. This imposes constraints on the train-test split, which must ensure that minimal information leaks into the test set through serially-correlated features, leading to an otherwise overestimated model performance.\footnote{We emphasize this aspect, as previous research of \textcite[][14]{ronenMachineLearningTrade2022} is expectedly affected from this issue leading to exaggerated results.} The violation of statistical independence, out rules methods like the $k$-fold cross-validation or random train-test splits, both of which assume samples to be i.i.d. \autocite[][103--105]{lopezdepradoAdvancesFinancialMachine2018}. Differently, our work statically splits into subsets by time, which maintains the temporal ordering and eschews data leakage. Albeit this limits the model's ability to leverage recent information for prediction beyond the training set's cut-off point. We do not explore dynamic training schemes, as they are practically intractable considering the number of model combinations and computational requirements of Transformers and \glspl{GBRT}. In the absence of an update mechanism, our results can be interpreted as a lower bound.
+Prime examples for auto-correlation between trades are market or limit orders that are split into smaller orders for eager order execution.
+The resulting separate transactions are trivial to classify given the true label of a single transaction. This imposes constraints on the train-test split, which must ensure that minimal information leaks into the test set through serially-correlated features, leading to an otherwise overestimated model performance.\footnote{We emphasize this aspect, as previous research of \textcite[\checkmark][14]{ronenMachineLearningTrade2022} is expectedly affected by this issue, leading to exaggerated results.} The violation of statistical independence rules out methods like $k$-fold cross-validation or random train-test splits, both of which assume samples to be i.i.d. \autocite[\checkmark][103--105]{lopezdepradoAdvancesFinancialMachine2018}. Differently, our work statically splits into subsets by time, which maintains the temporal ordering and eschews data leakage. Admittedly, this limits the model's ability to leverage recent information for prediction beyond the training set's cut-off point.
We do not explore dynamic training schemes, as they are practically intractable considering the number of model combinations and computational requirements of Transformers and \glspl{GBRT}. In the absence of an update mechanism, our results can be interpreted as a lower bound.

-Applying the time-based split, we attribute the first \SI{60}{\percent} of our dataset for training and the next \SI{20}{\percent} each for validation and testing. Days at the split boundary are assigned to either one set to avoid train-test contamination. % Data within the training and validation set may be shuffled to accelerate training.
+Applying the time-based split, we attribute the first \SI{60}{\percent} of our dataset for training and the next \SI{20}{\percent} each for validation and testing. Days at the split boundary are assigned to either one set to avoid train-test contamination.

\begin{figure}[ht]
\centering
@@ -165,8 +157,8 @@ \subsubsection{Train-Test Split}\label{sec:train-test-split}

Overall, we use \gls{ISE} data from 2 May 2005 to 24 October 2013 to train and data between 25 October 2013 and 5 November 2015 to validate our models. The most recent trades, until 31 May 2017, are used to assess the generalization error.

-Models are pre-trained on unlabeled samples from the last year of the training period. Given the significantly larger number of unlabeled customer trades, the pre-training period is reduced to one year to facilitate training on the available computing resources.
+Models are pre-trained on unlabeled samples from the last year of the training period. Given the significantly larger number of unlabeled customer trades, the pre-training period is reduced to half a year to facilitate training on the available computing resources.

-We use the \gls{CBOE} sample past 5 November 2015 as a second test set, as visualized in \cref{fig:train-test-split}. Our evaluation approach is the most rigorous as it disallows any form of adaptation of the models, thereby ensuring a rigorous evaluation. Unlike transfer learning techniques such as parameter or model transfer, which expectedly improve model performance, we choose to forgo these techniques and demonstrate the effectiveness of our models without any transfer of knowledge. The start date ensures that leakage from the \gls{ISE} set is minimized.\footnote{The datasets contain features, such as the \gls{NBBO}, that are identical for both sets, assuming trades were executed at both exchanges simultaneously. Also, quotes can be identical between exchanges, if market makers quote at the \gls{NBBO}, which is common practice as documented in \textcite[10]{securitiesandexchangecommissionReportConcerningExaminations2007}. Utilizing the full \gls{CBOE} sample could result in exaggerated performance estimates if the corresponding \gls{ISE} trade is used in training.}
+We use the \gls{CBOE} sample past 5 November 2015 as a second test set, as visualized in \cref{fig:train-test-split}. Our evaluation approach is the most rigorous, as it disallows any form of adaptation of the models. Unlike transfer learning techniques such as parameter or model transfer, which expectedly improve model performance, we choose to forgo these techniques and demonstrate the effectiveness of our models without any transfer of knowledge.
The start date ensures that leakage from the \gls{ISE} set is minimized.\footnote{The datasets contain features, such as the \gls{NBBO}, that are identical for both sets, assuming trades were executed at both exchanges simultaneously. Also, quotes can be identical between exchanges, if market makers quote at the \gls{NBBO}, which is common practice as documented in \textcite[\checkmark][10]{securitiesandexchangecommissionReportConcerningExaminations2007}. Utilizing the full \gls{CBOE} sample could result in exaggerated performance estimates if the corresponding \gls{ISE} trade is used in training.}

-Our train-test-split assumes that all subsets are drawn from the same distribution, so fitting a classifier on the training set and optimizing for the validation set provides good estimates for the test set. To validate this assumption, we use adversarial validation. Specifically, we re-label all training samples with $y=-1$ and all trades of the validation set with $y=1$, train a classifier on a random subset of the composed dataset and predict class conformance. The performance is estimated using the \gls{MCC} of \textcite[][445]{matthewsComparisonPredictedObserved1975}, which ranges between $\left[-1, 1\right]$ and is insensitive to class imbalances.\footnote{Classes are imbalanced, due to the training set being three times the size of the validation set.} Assuming train and validation samples are sampled from the same distribution, the performance estimate is near a random guess, or $\operatorname{MCC} = 0$. For the mid-sized feature set, the \gls{MCC} is \num{0.364260805498287} suggesting training and validation sets are approximately similar. The next section discusses techniques used in training the classifiers.
\ No newline at end of file
+Our train-test-split assumes that all subsets are drawn from the same distribution, so fitting a classifier on the training set and optimizing for the validation set provides good estimates for the test set. To validate this assumption, we use adversarial validation. Specifically, we re-label all training samples with $y=-1$ and all trades of the validation set with $y=1$, train a classifier on a random subset of the composed dataset, and predict class conformance. The performance is estimated using the \gls{MCC} of \textcite[\checkmark][445]{matthewsComparisonPredictedObserved1975}, which ranges between $\left[-1, 1\right]$ and is insensitive to class imbalances.\footnote{Classes are imbalanced, due to the training set being three times the size of the validation set.} Assuming train and validation samples are sampled from the same distribution, the performance estimate is near a random guess, or $\operatorname{MCC} = 0$. For the mid-sized feature set, the \gls{MCC} is \num{0.364260805498287}, suggesting training and validation sets are approximately similar. The next section discusses techniques used in training the classifiers.
\ No newline at end of file
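A minimal sketch of the adversarial-validation check is given below. It uses a generic scikit-learn gradient-boosting classifier in place of the exact model from the study, and the variable names and sampling fraction are illustrative assumptions.

import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

def adversarial_validation(X_train: np.ndarray, X_val: np.ndarray, frac: float = 0.1, seed: int = 42) -> float:
    """Distinguish training from validation rows; an MCC near 0 indicates similar distributions."""
    rng = np.random.default_rng(seed)
    # Re-label rows by their origin: -1 for the training set, 1 for the validation set.
    X = np.vstack([X_train, X_val])
    y = np.concatenate([-np.ones(len(X_train)), np.ones(len(X_val))])
    # Work on a random subset of the composed dataset to keep the check cheap.
    idx = rng.choice(len(X), size=int(frac * len(X)), replace=False)
    X_fit, X_hold, y_fit, y_hold = train_test_split(X[idx], y[idx], test_size=0.3, random_state=seed)
    clf = HistGradientBoostingClassifier(random_state=seed).fit(X_fit, y_fit)
    return matthews_corrcoef(y_hold, clf.predict(X_hold))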
diff --git a/reports/Content/end.tex b/reports/Content/end.tex
index 97f2548f..1ce48d2b 100644
--- a/reports/Content/end.tex
+++ b/reports/Content/end.tex
@@ -1,33 +1,20 @@
-\section{Discussion}\label{sec:discussion}
+\section{Conclusion and Outlook}\label{sec:conclusion-outlook}
-In this study, we applied gradient boosting and the FT-Transformer, two well-established machine learning classifiers, to the task of trade classification in the option market. While our results clearly demonstrate the superior performance of machine learning over rule-based classification, it's important to acknowledge the limitations of our approach.
+% \todo{The predictability results survive an extensive list of robustness checks. Make clear we compare deep learning vs tree-based methods. None is superior.}
-Inference of our classifiers is computationally cheap, but training requires a significant amount of compute. To make training feasible at all, great effort is spent on utilizing computing resources by optimizing memory transfers, compute graphs, and data representation. In cases, where computing resources are scarce or classification accuracy is not the primary target, we advocate for heuristics, such as the \gls{GSU} method, which balances between computational cost and performance.
-
-All of our classifiers require some labeled instances for training. If the true label cannot be inferred from trades or generating labeled data is wasteful, our approaches are not applicable. For cases, where trades are partially labeled, our pre-trained FT-Transformer offers a viable alternative to rule-based classification.
-
-\newpage
-\section{Conclusion}\label{sec:conclusion}
-
-\todo{The predictability results survive an extensive list of robustness checks. Make clear we compare deep learning vs tree-based methods. None is superior.}
-
-The goal of this study is to examine the performance of machine learning-based trade classification in the option market. In particular, we propose to model trade classification with Transformers and gradient boosting. Both approaches are supervised and leverage labeled trades. For settings, where labeled trades are scarce, we extend Transformers with a pre-training objective to train on unlabeled trades as well as generate pseudo-labels for gradient boosting through a self-training procedure.
+The goal of this study is to examine the performance of machine learning-based trade classification in the option market. In particular, we propose to model trade classification with Transformers and gradient boosting. Both approaches are supervised and leverage labeled trades. For settings where labeled trades are scarce, we extend Transformers with a pre-training objective to train on unlabeled trades as well as generate pseudo-labels for gradient boosting through a self-training procedure.

Our models establish a new state-of-the-art for trade classification on the \gls{ISE} and \gls{CBOE} datasets. For \gls{ISE} trades, Transformers achieve an accuracy of \SI{63.78}{\percent} when trained on trade and quoted prices as well as \SI{72.58}{\percent} when trained on additional quoted sizes, improving over the current best of \textcite[][27]{grauerOptionTradeClassification2022} by \SI{3.73}{\percent} and \SI{4.97}{\percent}. Similarly, \glspl{GBRT} reach accuracies between \SI{63.67}{\percent} and \SI{73.24}{\percent}. We observe performance improvements up to \SI{6.51}{\percent} for \glspl{GBRT} and \SI{6.31}{\percent} for Transformers when models have access to option characteristics. Relative to the ubiquitous tick test, quote rule, and \gls{LR} algorithm, improvements are \SI{23.88}{\percent}, \SI{17.11}{\percent}, and \SI{17.02}{\percent}. Outperformance is particularly strong for \gls{ITM} options, options with a long maturity, as well as options traded at the quotes.
Both architectures generalize well on \gls{CBOE} data, with even stronger improvements between \SI{4.92}{\percent} and \SI{7.58}{\percent} over the benchmark depending on the model and feature set. In the semi-supervised setting, Transformers on the \gls{ISE} dataset profit from pre-training on unlabeled trades with accuracies up to \SI{74.55}{\percent}, but the performance gains slightly diminish on the \gls{CBOE} test set. Vice versa, we observe no benefits from semi-supervised training of \glspl{GBRT}.

-% Consistent with \textcites[][27]{grauerOptionTradeClassification2022}[][901]{savickasInferringDirectionOption2003} we find evidence that the performance of common trade classification rules deteriorates in the option market. In particular, tick-based methods marginally outperform a random guess.
Unlike previous studies, we can trace back the performance of our approaches as well as of trade classification rules to individual features and feature groups using the importance measure \gls{SAGE}. We find that both paradigms attain the largest performance improvements from classifying trades based on quoted sizes and prices, but machine learning-based classifiers achieve higher performance gains and exploit the data more effectively. The change in the trade price, the decisive criterion of the (reverse) tick test, plays no role in option trade classification. We identify the relative illiquidity of options to affect the information content of the surrounding trade prices. Our classifiers profit from the inclusion of option-specific features, like moneyness and time-to-maturity, currently unexploited in classical trade classification. By probing and visualizing the attention mechanism of the Transformer, we can establish a connection to rule-based classification. Graphically, our results show that attention heads encode knowledge about rule-based classification. Whilst attention heads in earlier layers of the network broadly attend to all features or their embeddings, heads in later layers focus on specific features jointly used in rule-based classification, akin to the \gls{LR} algorithm, depth rule, or others. Furthermore, embeddings encode domain knowledge. Our results demonstrate, exemplarily for the traded underlying, that the Transformer learns to group similar underlyings in embedding space.

-Our classifiers deliver accurate predictions and improved robustness, which effectively reduces noise and bias in option research dependent on reliable trade initiator estimates.
When applied to measuring trading cost through effective spreads, the models dominate all rule-based approaches by approximating the true effective spread of options best. -\newpage -\section{Outlook}\label{sec:outlook} +In conclusion, our study showcases the efficacy of machine learning as a viable alternative to existing trade signing algorithms for classifying option trades, if partially-labeled or labeled trades are available for training. In future work, we plan to revisit training Transformers on a larger corpus of unlabeled trades through pre-training objectives and study the effects of \emph{exchange-specific} finetuning. While our current results show that pre-training positively drives classification performance, for comparability it is only performed on a small subset of trades, and the models have not fully converged. Thus, we expect to see benefits from additional data and compute, following the scaling laws of \textcite[][7]{hoffmannTrainingComputeOptimalLarge2022}. Pre-training on unlabeled trades is particularly advantageous when finetuning is constrained by the limited availability of the true trade initiator. diff --git a/reports/Content/evaluation.tex b/reports/Content/evaluation.tex index 46d0c63a..d45dac3a 100644 --- a/reports/Content/evaluation.tex +++ b/reports/Content/evaluation.tex @@ -24,19 +24,19 @@ \subsubsection{Feature Importance Measure}\label{sec:feature-importance-measure} Naturally, we aim to gain insights into the prediction process and identify relevant features, which fall under the umbrella of \emph{interpretability}. -Following, \textcite[][4]{liptonMythosModelInterpretability2017} interpretability can be reached through model transparency or post-hoc interpretability methods. Transparent models provide interpretability through a transparent mechanism in the model, whereas post-hoc methods extract information from the already learned model \autocite[][4--5]{liptonMythosModelInterpretability2017}. +Following \textcite[\checkmark][44--45]{liptonMythosModelInterpretability2017}, interpretability can be reached through model transparency or post-hoc interpretability methods. Transparent models provide interpretability through a transparent mechanism in the model, whereas post-hoc methods extract information from the already learned model \autocite[\checkmark][44--45]{liptonMythosModelInterpretability2017}. -Classical trade classification algorithms, as a rule-based classifier, are transparent with an easily understandable decision process and thus provide interpretability \autocite[][91]{barredoarrietaExplainableArtificialIntelligence2020}. Interpretability, however, decreases for deep, stacked combinations involving a large feature count, when interactions between base rules become more complex and the effect of a single feature on the final prediction more challenging to interpret. +Classical trade classification algorithms, as rule-based classifiers, are transparent with an easily understandable decision process and thus provide interpretability \autocite[\checkmark][91]{barredoarrietaExplainableArtificialIntelligence2020}. Interpretability, however, decreases for deep, stacked combinations involving a large feature count, when interactions between base rules become more complex and the effect of a single feature on the final prediction becomes more challenging to interpret. -The machine learning classifiers, studied in this work, can be deemed a black box model \autocite[][90]{barredoarrietaExplainableArtificialIntelligence2020}.
Due to the sheer size of the network or ensemble, interpretability through transparency is impacted. Albeit, the attention mechanism of Transformers provides some interpretability through the attention mechanism, interpretability across all classifiers can only be reached through \emph{model-agnostic, post-hoc interpretability techniques}. +The machine learning classifiers studied in this work can be deemed black box models \autocite[\checkmark][90]{barredoarrietaExplainableArtificialIntelligence2020}. Due to the sheer size of the network or ensemble, interpretability through transparency is impacted. Although the attention mechanism of Transformers provides some interpretability, interpretability across all classifiers can only be reached through \emph{model-agnostic, post-hoc interpretability techniques}. -Thereby, our goal is to estimate how much a feature contributes to the performance of the classifier \emph{overall}, which urges for \emph{global feature attribution measures}. The appropriate approach is guided by the properties of the data. Due to the data-generating process with strongly correlated quotes and trade prices at the exchange and nationwide levels, features are strongly dependent. The redundant feature encoding of ratio features exacerbates this effect. Feature independence, however, is the central assumption of most popular feature importance measures, including \gls{SHAP} or random feature permutation \autocite[][2]{aasExplainingIndividualPredictions2021}. A violation of this constraint for two perfectly correlated, predictive features can have the effect that both are deemed unimportant as the feature importance is distributed between features underestimating the true importance of the feature \autocite[][4]{covertUnderstandingGlobalFeature2020}. For this reason, we estimate feature importances using \gls{SAGE}, which can account for complex interactions between features and yields global importances. +Thereby, our goal is to estimate how much a feature contributes to the performance of the classifier \emph{overall}, which calls for \emph{global feature attribution measures}. The appropriate approach is guided by the properties of the data. Due to the data-generating process with strongly correlated quotes and trade prices at the exchange and nationwide levels, features are strongly dependent. The redundant feature encoding of ratio features exacerbates this effect. Feature independence, however, is the central assumption of most popular feature importance measures, including \gls{SHAP} or random feature permutation \autocite[\checkmark][2]{aasExplainingIndividualPredictions2021}. A violation of this constraint for two perfectly correlated, predictive features can have the effect that both are deemed unimportant, as the importance is split between them, underestimating the true importance of each feature \autocite[\checkmark][17215]{covertUnderstandingGlobalFeature2020}. For this reason, we estimate feature importances using \gls{SAGE}, which can account for complex interactions between features and yields global importances. \textbf{Shapley Additive Global Importance} -\gls{SAGE} is an additive feature importance measure with its foundations in cooperative game theory. As put forth by \textcite[][3]{lundbergUnifiedApproachInterpreting2017} feature contributions can be estimated through Shapley values \autocite[][11]{shapley17ValueNPerson1953}.
Instead of allocating credit in a cooperative game to players, as in the original Shapley formulation, the problem transfers to assign credit across features based on a value function. Intuitionally, for \gls{SAGE}, credit is distributed among features based on the contribution to the model's performance. +\gls{SAGE} is an additive feature importance measure with its foundations in cooperative game theory. As put forth by \textcite[\checkmark][4770]{lundbergUnifiedApproachInterpreting2017}, feature contributions can be estimated through Shapley values \autocite[\checkmark][310--312]{shapley17ValueNPerson1953}. Instead of allocating credit in a cooperative game to players, as in the original Shapley formulation, the problem transfers to assigning credit across features based on a value function. Intuitively, for \gls{SAGE}, credit is distributed among features based on their contribution to the model's performance. -Again, $X$ is a random variable describing the input, $Y$ is the response variable, and $f$ is the classifier. In \gls{SAGE} \autocite[][4--5]{covertUnderstandingGlobalFeature2020}, Shapley values $\phi_i(v_f)$ are estimated as: +Again, $X$ is a random variable describing the input, $Y$ is the response variable, and $f$ is the classifier. In \gls{SAGE} \autocite[\checkmark][17215--17216]{covertUnderstandingGlobalFeature2020}, Shapley values $\phi_i(v_f)$ are estimated as: \begin{equation} \phi_i(v_f)=\frac{1}{d} \sum_{S \subseteq D \backslash\{i\}}\left(\begin{array}{c} d-1 \\ @@ -46,7 +46,7 @@ \subsubsection{Feature Importance \end{equation} where $D=\left\{1,\ldots,d\right\}$ is a set of feature indices corresponding to the features $x_1,\ldots,x_d$ and $S\subset D$. Intuitively, \cref{eq:shapley} estimates the Shapley value as the weighted average of the incremental change in the value function, $v_f(S)$, before and after the $i$-th feature is added to the feature subset $S$. Hereby, the first term $\left(\begin{smallmatrix} d-1 \\|S|\end{smallmatrix}\right)^{-1}$ accounts for the number of ways to choose a subset of size $|S|$ from $D \backslash\{i\}$. -While subsets of features $X_S = \left\{X_i \mid i \in S \right\}$ can be easily constructed, most classifiers, including ours, cannot handle the absence of features as they require fixed-sized inputs during inference. \textcite[][2]{covertUnderstandingGlobalFeature2020} mitigate the issue, by marginalizing out the missing features $\bar{S}=D\backslash S$ using the conditional distribution $p(X_{\bar{S}} \mid X_S=x_S)$. Following \textcite[][4--5]{covertUnderstandingGlobalFeature2020}, the performance of the model for a given subset of features $S$ and loss function $\ell$ can now be estimated by +While subsets of features $X_S = \left\{X_i \mid i \in S \right\}$ can be easily constructed, most classifiers, including ours, cannot handle the absence of features as they require fixed-sized inputs during inference. \textcite[\checkmark][17213]{covertUnderstandingGlobalFeature2020} mitigate the issue by marginalizing out the missing features $\bar{S}=D\backslash S$ using the conditional distribution $p(X_{\bar{S}} \mid X_S=x_S)$. Following \textcite[\checkmark][17215--17216]{covertUnderstandingGlobalFeature2020}, the performance of the model for a given subset of features $S$ and loss function $\ell$ can now be estimated by \begin{equation} v_f(S)=-\mathbb{E}\left[\ell\left(\mathbb{E}\left[f(X) \mid X_S\right], Y\right)\right].
\end{equation} @@ -55,13 +55,13 @@ \subsubsection{Feature Importance \textbf{Attention Maps} -In addition to \gls{SAGE}, Transformer-based models offer \emph{some} interpretability through their attention mechanism. Consistent with \textcite[][8]{wiegreffeAttentionNotNot2019} we view attention scores as a vehicle to model transparency. +In addition to \gls{SAGE}, Transformer-based models offer \emph{some} interpretability through their attention mechanism. Consistent with \textcite[\checkmark][18]{wiegreffeAttentionNotNot2019}, we view attention scores as a vehicle for model transparency. -Recall from our discussion on attention (cp. \cref{sec:attention}) that the attention matrix stores how much attention a token pays to each of the keys. Thus, feature attributions can be derived from attention by visualizing features to which the model attends to in an attention map. While attention maps are specific to Transformers or other attention-based architectures, rendering them useless for cross-model comparisons, they give additional insights from different attention layers and attention heads of the model on a per-trade and global basis. +Recall from our discussion on attention (cp. \cref{sec:attention}) that the attention matrix stores how much attention a token pays to each of the keys. Thus, feature attributions can be derived from attention by visualizing the features to which the model attends in an attention map. While attention maps are specific to Transformers or other attention-based architectures, rendering them useless for cross-model comparisons, they give additional insights from different attention layers and attention heads of the model on a per-trade and global basis. -In the tabular domain, various approaches have been investigated in the literature to obtain attention from multiple attention heads and Transformer blocks. \textcite[][18]{somepalliSaintImprovedNeural2021} and \textcite[][11]{borisovDeepNeuralNetworks2022} gather attention maps from the first attention layer only, and \textcite[][11]{borisovDeepNeuralNetworks2022} additionally obtain feature attributions by taking the diagonal of the attention matrix $\mathbf{A}$ or through column-wise summation. In contrast, \textcite[][10]{gorishniyRevisitingDeepLearning2021} leverage all attention matrices by averaging over multiple Transformer blocks, attention heads, and samples to obtain global feature attributions. Given \cref{sec:architectural-overview,sec:attention}, where we emphasized the unique role of attention heads and lower sub-layers, both approaches may be myopic, as attention heads contribute unequally to the result, or as later attention layers are neglected altogether. +In the tabular domain, various approaches have been investigated in the literature to obtain attention from multiple attention heads and Transformer blocks. \textcite[\checkmark][18]{somepalliSaintImprovedNeural2021} and \textcite[\checkmark][8]{borisovDeepNeuralNetworks2022} gather attention maps from the first attention layer only, and \textcite[\checkmark][8]{borisovDeepNeuralNetworks2022} additionally obtain feature attributions by taking the diagonal of the attention matrix $\mathbf{A}$ or through column-wise summation. In contrast, \textcite[\checkmark][18941]{gorishniyRevisitingDeepLearning2021} leverage all attention matrices by averaging over multiple Transformer blocks, attention heads, and samples to obtain global feature attributions.
Given \cref{sec:architectural-overview,sec:attention}, where we emphasized the unique role of attention heads and lower sub-layers, both approaches may be myopic, as attention heads contribute unequally to the result, or as later attention layers are neglected altogether. -While not explored systematically in the tabular domain yet, the rollout attention method of \textcite[][3]{abnarQuantifyingAttentionFlow2020} combines raw attention from multiple layers through recursive matrix multiplication with the weight matrices from attention layers below, as shown in this Equation:\footnote{Notation from adapted from \textcite[][786]{cheferTransformerInterpretabilityAttention2021}.} +While not explored systematically in the tabular domain yet, the rollout attention method of \textcite[\checkmark][4192]{abnarQuantifyingAttentionFlow2020} combines raw attention from multiple layers through recursive matrix multiplication with the attention matrices from the layers below, as shown in \cref{eq:attention-map-rollout}:\footnote{Notation adapted from \textcite[\checkmark][786]{cheferTransformerInterpretabilityAttention2021}.} \begin{equation} \begin{aligned} \hat{\mathbf{A}}^{(l)} & =\mathbf{I}+\mathbb{E}_h \mathbf{A}^{(l)} \\ @@ -70,9 +70,9 @@ \subsubsection{Feature Importance -In each layer the raw attention scores $\mathbf{A}^{(l)}$ are averaged over $h$ heads, denoted by $\mathbb{E}_h$. The identity matrix $\mathbf{I}$ is added to account for the residual connections. While rollout attention considers all attention layers in the calculation of feature attributions, it does not consider a signal and attributes equal weights to all attention heads \autocite[][786]{cheferTransformerInterpretabilityAttention2021}. +In each layer, the raw attention scores $\mathbf{A}^{(l)}$ are averaged over $h$ heads, denoted by $\mathbb{E}_h$. The identity matrix $\mathbf{I}$ is added to account for the residual connections. While rollout attention considers all attention layers in the calculation of feature attributions, it does not incorporate a class-specific signal and attributes equal weights to all attention heads \autocite[\checkmark][786]{cheferTransformerInterpretabilityAttention2021}. -In an attempt to explain the decision-making process of multi-modal Transformers, including self-attention-based Transformers, \textcite[][3]{cheferGenericAttentionmodelExplainability2021} incorporate gradients to weight the head's contribution when averaging over the heads of a layer, as shown in \cref{eq:attention-map-weighted}. Like before, all attention layers are considered. +In an attempt to explain the decision-making process of multi-modal Transformers, including self-attention-based Transformers, \textcite[\checkmark][399]{cheferGenericAttentionmodelExplainability2021} incorporate gradients to weight each head's contribution when averaging over the heads of a layer, as shown in \cref{eq:attention-map-weighted}. Like before, all attention layers are considered. \begin{equation} \begin{aligned} @@ -82,4 +82,4 @@ \subsubsection{Feature Importance \label{eq:attention-map-weighted} \end{equation} -In this approach, the element-wise product between the gradient of the attention map $\nabla \mathbf{A}^{(l)}=\frac{\partial y_t}{\partial \mathbf{A}}$ for the model's target class $t$ and the attention map $\mathbf{A}^{(l)}$ is calculated to weight the attention head's importance.
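To make the two aggregation schemes concrete, the short Python sketch below mirrors \cref{eq:attention-map-rollout,eq:attention-map-weighted} on plain NumPy arrays. It is an illustration under assumed array names and shapes, not the implementation used in this thesis; in particular, the per-layer gradients are taken as given, even though obtaining them requires one backward pass for the target class.

import numpy as np

def attention_rollout(attentions):
    # attentions: list of per-layer arrays of shape (n_heads, n_tokens, n_tokens),
    # ordered from the first to the last attention layer.
    n_tokens = attentions[0].shape[-1]
    rollout = np.eye(n_tokens)
    for attn in attentions:
        a_hat = np.eye(n_tokens) + attn.mean(axis=0)  # I + E_h A^(l); identity for residual connections
        rollout = a_hat @ rollout                     # combine with the rollout of the layers below
    return rollout

def gradient_weighted_rollout(attentions, grads):
    # grads: gradients of the target-class output w.r.t. each attention map,
    # with the same shapes as `attentions`.
    n_tokens = attentions[0].shape[-1]
    relevance = np.eye(n_tokens)
    for attn, grad in zip(attentions, grads):
        # average the positive part of (grad * A) over the heads dimension
        weighted = np.clip(grad * attn, 0, None).mean(axis=0)
        relevance = (np.eye(n_tokens) + weighted) @ relevance
    return relevance

A global, per-feature attribution can then be read off a row of the resulting matrix and averaged over trades, analogous to the averaging over samples described above.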
As introduced in \textcite[][786]{cheferTransformerInterpretabilityAttention2021}, negative contributions are eliminated to focus on the positive relevance, and the results are averaged over the heads dimension. Like all other presented approaches \cref{eq:attention-map-rollout,eq:attention-map-weighted} can be computed with a single forward pass and is therefore computationally efficient. \ No newline at end of file +In this approach, the element-wise product between the gradient of the attention map $\nabla \mathbf{A}^{(l)}=\frac{\partial y_t}{\partial \mathbf{A}}$ for the model's target class $t$ and the attention map $\mathbf{A}^{(l)}$ is calculated to weight the attention head's importance. As introduced in \textcite[\checkmark][786]{cheferTransformerInterpretabilityAttention2021}, negative contributions are eliminated to focus on the positive relevance, and the results are averaged over the heads dimension. Like all other presented approaches, \cref{eq:attention-map-rollout,eq:attention-map-weighted} can be computed with a single forward pass, plus one backward pass for the gradients in \cref{eq:attention-map-weighted}, and are therefore computationally efficient. \ No newline at end of file diff --git a/reports/Content/introduction.tex b/reports/Content/introduction.tex index 1be51bcf..1432ceb6 100644 --- a/reports/Content/introduction.tex +++ b/reports/Content/introduction.tex @@ -1,22 +1,22 @@ \section{Introduction}\label{sec:introduction} -Every option trade has a buyer and seller side. For a plethora of problems in option research, it’s also crucial to determine the party that initiated the transaction. Common applications include the study of option demand \autocite[][3]{garleanuDemandBasedOptionPricing2009}, of informational content in option trading \autocites[][631]{huDoesOptionTrading2014}[][882]{panInformationOptionVolume2006}[][1079]{caoInformationalContentOption2005}, of order flow \autocite[][684]{muravyevOrderFlowExpected2016}, or trading costs \autocite[][4980]{muravyevOptionsTradingCosts2020}. +Every option trade has a buyer and a seller side. For a plethora of problems in option research, it is also crucial to determine the party that initiated the transaction. Common applications include the study of option demand \autocite[\checkmark][4261]{garleanuDemandBasedOptionPricing2009}, of informational content in option trading \autocites[\checkmark][631]{huDoesOptionTrading2014}[\checkmark][882]{panInformationOptionVolume2006}[\checkmark][1079]{caoInformationalContentOption2005}, of order flow \autocite[\checkmark][684]{muravyevOrderFlowExpected2016}, or of trading costs \autocite[\checkmark][4980]{muravyevOptionsTradingCosts2020}. -Despite the clear importance for empirical research, the true initiator of the trade is frequently missing in option data sets and must be inferred using trade classification rules \autocite[][453]{easleyOptionVolumeStock1998}. In consequence, the correctness of empirical studies hinges on the algorithm's ability to accurately identify the trade initiator. +Despite the clear importance for empirical research, the true initiator of the trade is frequently missing in option data sets and must be inferred using trade classification rules \autocite[\checkmark][453]{easleyOptionVolumeStock1998}. In consequence, the correctness of empirical studies depends on the algorithm's ability to accurately identify the trade initiator.
-Among the most prevailing variants to sign trades are the tick test \autocite[][240]{hasbrouckTradesQuotesInventories1988}, quote rule \autocite[][41]{harrisDayEndTransactionPrice1989}, and hybrids thereof such as the \gls{LR} algorithm \autocite[][745]{leeInferringTradeDirection1991}, the \gls{EMO} algorithm \autocite[][536]{ellisAccuracyTradeClassification2000}, and the \gls{CLNV} method \autocite[][3809]{chakrabartyTradeClassificationAlgorithms2007}, that infer the trade initiator from adjacent prices and quotes. These heuristics have initially been proposed and tested in the stock market. +Among the most prevailing variants to sign trades are the tick test \autocite[\checkmark][240]{hasbrouckTradesQuotesInventories1988}, quote rule \autocite[\checkmark][41]{harrisDayEndTransactionPrice1989}, and hybrids thereof such as the \gls{LR} algorithm \autocite[\checkmark][745]{leeInferringTradeDirection1991}, the \gls{EMO} algorithm \autocite[\checkmark][536]{ellisAccuracyTradeClassification2000}, and the \gls{CLNV} method \autocite[\checkmark][3812]{chakrabartyTradeClassificationAlgorithms2007}, which infer the trade initiator from adjacent prices and quotes. These heuristics were initially proposed and tested in the stock market. -For option markets, the works of \textcites[][10--13]{grauerOptionTradeClassification2022}[][887]{savickasInferringDirectionOption2003} raise concerns about the transferability of standard trade signing rules due to deteriorating classification accuracies and systematic misclassifications. The latter is unsettling, as non-random misclassifications ultimately bias the dependent research \autocites[][260]{odders-whiteOccurrenceConsequencesInaccurate2000}[][157]{theissenTestAccuracyLee2001}. +For option markets, the works of \textcites[\checkmark][11--13]{grauerOptionTradeClassification2022}[\checkmark][887--891]{savickasInferringDirectionOption2003} raise concerns about the transferability of standard trade signing rules due to deteriorating classification accuracies and systematic misclassifications. The latter is critical, as non-random misclassifications ultimately bias the dependent research \autocites[\checkmark][260]{odders-whiteOccurrenceConsequencesInaccurate2000}[\checkmark][157]{theissenTestAccuracyLee2001}. -Recent work of \textcite[][13--16]{grauerOptionTradeClassification2022} made significant progress by proposing explicit overrides for trade types and by combining multiple heuristics, thereby advancing the state-of-the-art performance in option trade classification. By this means, their approach enforces a more sophisticated decision boundary eventually leading to a more accurate classification. Beyond heuristics, however, it remains open, if classifiers \emph{learned} on trade data can improve upon \emph{static} classification rules in terms of performance and robustness. +The recent work of \textcite[\checkmark][13--16]{grauerOptionTradeClassification2022} made significant progress in classification accuracy by proposing explicit overrides for trade types and by combining multiple heuristics, thereby advancing the state-of-the-art performance in option trade classification. By this means, their approach enforces a more sophisticated decision boundary, eventually leading to more accurate classification. The fundamental constraint is that performance improvements accumulate in small subsets of trades due to the dedicated overrides.
Beyond heuristics, it remains open whether classifiers \emph{learned} on trade data can improve upon \emph{static} classification rules in terms of performance and robustness. Our work fills this gap by focusing on machine learning methods to infer the trade initiator in the option market. Approaching trade classification with machine learning is a logical choice, given its capability to handle high-dimensional trade data and learn complex decision boundaries. This raises the question, \emph{can an alternative machine learning-based classifier improve upon the accuracy of state-of-the-art approaches for option trade classification?} To answer this question, we model trade classification through machine learning. We consider the supervised case, where fully-labeled trade data is available, as well as the semi-supervised setting, where trades are partially labeled with the true trade initiator. Our work makes the following contributions: \begin{enumerate}[label=(\roman*),noitemsep] - \item We employ state-of-the-art supervised algorithms i.~e., gradient-boosted trees and Transformer networks to the problem of trade classification and benchmark these approaches against rules-based methods. Our approaches outperform all rule-based approaches on \gls{ISE} and \gls{CBOE} data with comparable data requirements. In the application setting, our approaches approximate the true effective spread best. - \item In a real-world setting, labeled trades are typically scarce, while unlabeled trades are abundant. Motivated by this consideration, we extend the classifiers to learn on both labeled and unlabeled instances through pre-training and self-training procedures. We analyze the effect on classification accuracy and observe that pre-training of Transformers further alleviates accuracy on \gls{ISE} trades. + \item We apply state-of-the-art supervised algorithms, i.~e., gradient-boosted trees and Transformer networks, to the problem of trade classification and benchmark these approaches against rule-based methods. Our approaches outperform all rule-based approaches on \gls{ISE} and \gls{CBOE} data with comparable data requirements. Our smallest and medium-sized Transformers outperform the best previously reported rules from \textcite[\checkmark][13--15]{grauerOptionTradeClassification2022} by \SI{3.73}{\percent} to \SI{4.97}{\percent} in accuracy on \gls{ISE} and \SI{5.44}{\percent} to \SI{5.64}{\percent} on \gls{CBOE}. The largest Transformer with additional dependencies on option data classifies up to \SI{74.28}{\percent} (+\SI{7.76}{\percent}) correctly. For gradient boosting, improvements in accuracy range between \SI{3.62}{\percent} and \SI{4.73}{\percent} on \gls{ISE}, and between \SI{5.26}{\percent} and \SI{5.43}{\percent} on \gls{CBOE}, when trained solely on trade data. Moreover, we apply our classifiers to the problem of effective spread estimation. On \gls{CBOE} data, our models approximate the true effective spread of \SI{2.50}{\percent} best, versus an estimated spread of \SI{5.70}{\percent}. On \gls{ISE}, they are among the best-performing solutions. + \item For the semi-supervised case, we extend the classifiers to learn on both labeled and unlabeled instances through pre-training and self-training procedures. We analyze the effect on classification accuracy and observe that pre-training of Transformers further improves classification performance on \gls{ISE} trades with accuracies up to \SI{74.55}{\percent} (+\SI{6.94}{\percent}).
\item Through a game-theoretic approach, our work is the first to consistently attribute the performance of rule-based and machine learning-based classification to individual features. We discover that both paradigms share common features, but machine learning-based classifiers attain higher performance gains and effectively exploit the data. By probing and visualizing the attention mechanism in the Transformer, we can strengthen the connection to rule-based classification and reveal that \emph{learned} rules mimic classical rules. \end{enumerate} -The remainder of this thesis is organized as follows. \cref{sec:related-work} reviews publications on trade classification in option markets and using machine learning, thereby underpinning our research framework. \cref{sec:rule-based-approaches} introduces extant methods for rule-based trade classification. \cref{sec:supervised-approaches} discusses and introduces supervised methods for trade classification. Then, \cref{sec:semi-supervised-approaches} extends the previously selected algorithms for the semi-supervised case. We test the models in \cref{sec:empirical-study} in an empirical setting. In \cref{sec:application} we apply our models to the problem of effective spread estimation. Finally, \cref{sec:discussion} discusses limitations, and \cref{sec:conclusion} concludes. +The remainder of this thesis is organized as follows. \cref{sec:related-work} reviews publications on trade classification in option markets or using machine learning, thereby underpinning our research framework. \cref{sec:rule-based-approaches} introduces methods for rule-based trade classification. \cref{sec:supervised-approaches} introduces and discusses supervised methods for trade classification. Then, \cref{sec:semi-supervised-approaches} extends the previously selected algorithms for the semi-supervised case. We test the models in \cref{sec:empirical-study} in an empirical setting and report results in \cref{sec:results}. In \cref{sec:application}, we apply our models to the problem of effective spread estimation. Finally, \cref{sec:conclusion-outlook} concludes. diff --git a/reports/Content/makefile b/reports/Content/makefile new file mode 100644 index 00000000..875fd49c --- /dev/null +++ b/reports/Content/makefile @@ -0,0 +1,11 @@ +# Check style: +# https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/ +proof: + echo "weasel words: " + sh ../../src/otc/utils/weasel.sh *.tex + echo + echo "passive voice: " + sh ../../src/otc/utils/passive.sh *.tex + echo + echo "duplicates: " + perl ../../src/otc/utils/dups.pl *.tex \ No newline at end of file diff --git a/reports/Content/related-work.tex b/reports/Content/related-work.tex index bb8ed2d9..990590a7 100644 --- a/reports/Content/related-work.tex +++ b/reports/Content/related-work.tex @@ -5,24 +5,24 @@ \section{Related Work}\label{sec:related-work} \subsection{Trade Classification in Option Markets} \label{sec:trade-classification-in-option-markets} -While classical trade classification algorithms are extensively tested in the stock markets (e.g., \textcite[][3806--3821]{chakrabartyTradeClassificationAlgorithms2012}; \textcite[][259--286]{odders-whiteOccurrenceConsequencesInaccurate2000}), few works have examined trade classification in option markets. 
+While classical trade classification algorithms are extensively tested in the stock markets (e.g., \textcite[\checkmark][3806--3821]{chakrabartyTradeClassificationAlgorithms2007}; \textcite[\checkmark][259--286]{odders-whiteOccurrenceConsequencesInaccurate2000}), few works have examined trade classification in option markets. -\textcite[882--883]{savickasInferringDirectionOption2003} were the first to compare the tick rule, quote rule, the \gls{LR} algorithm and the \gls{EMO} rule for options traded at the \gls{CBOE}. The dataset spans a period from 3 July 1995 to 31 December 1995 consisting of \num{869217} matched trades. The authors report the highest accuracies for the quote rule (\SI{78.98}{\percent}) and find that all rules perform worst when applied to index options. In general, the trade classification rules exhibit significantly lower classification accuracies on options data than with stock data, urging the need for improved classifiers. +\textcite[\checkmark][883--887]{savickasInferringDirectionOption2003} were the first to compare the tick rule, quote rule, the \gls{LR} algorithm, and the \gls{EMO} rule for options traded at the \gls{CBOE}. The dataset spans a period from 3 July 1995 to 31 December 1995 consisting of \num{869217} matched trades. The authors report the highest accuracies for the quote rule (\SI{78.98}{\percent}) and find that all rules perform worst when applied to index options. In general, the trade classification rules exhibit significantly lower classification accuracies on options data than with stock data, urging the need for improved classifiers. -The most exhaustive study is the one of \textcite[1--39]{grauerOptionTradeClassification2022}. The authors test the accuracy of the classical quote rule and tick rule, and hybrids thereof on three large-scale datasets spanning periods from 2005 to 2017~\footnote{We formally define accuracy in \cref{sec:evaluation-metric}.}. Consistently for options traded at the \gls{CBOE} and \gls{ISE} classical rules like the popular \gls{LR} algorithm only achieve accuracies of \SI{62.03}{\percent} or \SI{62.53}{\percent} and are thus significantly smaller than in the stock market. In line with the research of \textcite[886]{savickasInferringDirectionOption2003}, the reported accuracies are inversely proportional to the rule's reliance on past transaction prices. In particular, the tick rule performs worst with accuracies marginally different from a random guess. Overall, the success rates of classical algorithms deteriorate between both studies and over time. As a remedy, \textcite[14--17]{grauerOptionTradeClassification2022} introduce two additional rules based on the trade and quote sizes. The depth rule aims to classify midspread trades based on the depth at the bid or ask. Together with the trade size rule, their second rule, which classifies trades with a trade size matching the size of the bid or ask quote, the authors can substantially improve the performance of classical algorithms. Their best ensemble of rules achieves an accuracy between \SI{73}{\percent} and \SI{75}{\percent} surpassing previous approaches by more than \SI{10}{\percent}. +The most exhaustive study is the one of \textcite[\checkmark][1--53]{grauerOptionTradeClassification2022}. 
The authors test the accuracy of the classical quote rule and tick rule, and hybrids thereof on three large-scale datasets spanning periods from 2005 to 2017.\footnote{We formally define accuracy in \cref{sec:evaluation-metric}.} Consistently for options traded at the \gls{CBOE} and \gls{ISE}, classical rules like the popular \gls{LR} algorithm only achieve accuracies of \SI{62.03}{\percent} or \SI{62.53}{\percent} and are thus significantly lower than in the stock market. In line with the research of \textcite[\checkmark][886]{savickasInferringDirectionOption2003}, the reported accuracies are inversely proportional to the rule's reliance on past transaction prices. In particular, the tick rule performs worst with accuracies marginally different from a random guess. Overall, the success rates of classical algorithms deteriorate between both studies and over time. As a remedy, \textcite[][13--15]{grauerOptionTradeClassification2022} introduce two additional rules based on the trade and quote sizes. The depth rule aims to classify midspread trades based on the depth at the bid or ask. Together with their second rule, the trade size rule, which classifies trades with a trade size matching the size of the bid or ask quote, the authors can substantially improve the performance of classical algorithms. Their best ensemble of rules achieves an accuracy between \SI{73}{\percent} and \SI{75}{\percent}, surpassing previous approaches by more than \SI{10}{\percent}. -The work of \textcite[1--39]{grauerOptionTradeClassification2022} is relevant to this thesis for two reasons. First, their analysis, like ours, is based on the same datasets, allowing for a fair comparison between classical rules and machine learning-based predictors. Second, their stacked combinations of the trade size rule, depth rule, and common trade classification algorithms achieve state-of-the-art performance in option trade classification and are thus a rigorous benchmark for our methods. +The work of \textcite[\checkmark][1--53]{grauerOptionTradeClassification2022} is relevant to this thesis for two reasons. First, their analysis, like ours, is based on the same datasets, allowing for a fair comparison between classical rules and machine learning-based predictors.\footnote{We only consider \gls{CBOE} and \gls{ISE} data. The authors additionally evaluate on GEMX data in their latest update dated May 2023, which we cannot consider.} Second, their stacked combinations of the trade size rule, depth rule, and common trade classification algorithms achieve state-of-the-art performance in option trade classification and are thus a rigorous benchmark for our methods.
However, it only improves the accuracy of the \gls{EMO} algorithm by a marginal \SI{2.00}{\percent} for \gls{NASDAQ} stocks and \SI{1.10}{\percent} for \gls{NYSE} stocks \autocite[15]{rosenthalModelingTradeDirection2012}. Our work aims to improve the model by exploring non-linear estimators and minimizing data modeling assumptions. +In the stock market, \textcite[\checkmark][396--398]{rosenthalModelingTradeDirection2012} bridges the gap between classical trade classification and machine learning by fitting a logistic regression model on lagged and unlagged features innate to the tick rule, quote rule, and \gls{EMO} algorithm, as well as a sector-specific and a time-specific term. Instead of using the rule's discretized outcome as a feature, he models the rules through so-called information strength functions \autocite[\checkmark][396--398]{rosenthalModelingTradeDirection2012}. The proximity to the quotes, central to the \gls{EMO} algorithm, is thus modeled by a proximity function. Likewise, the information strength of the quote and tick rule is estimated as the log return between the trade price and the midpoint or the previous trade price. However, his model only improves the accuracy of the \gls{EMO} algorithm by a marginal \SI{2.00}{\percent} for \gls{NASDAQ} stocks and \SI{1.10}{\percent} for \gls{NYSE} stocks \autocite[\checkmark][405]{rosenthalModelingTradeDirection2012}. Our work aims to improve the model by exploring non-linear estimators and minimizing data modeling assumptions. -The work of \textcite[483]{blazejewskiLocalNonParametricModel2005} compares a $k$-nearest neighbor classifier against logistic regression, as well as simple heuristics like the majority vote over past trades for signing trades at the Australian stock exchange. Their results indicate that the parametric $k$-nearest neighbor classifier improves upon a linear logistic regression in terms of classification accuracy, even when trained on fewer features. The work is unique from the remaining works about the feature set definition. Notably, \textcite[483]{blazejewskiLocalNonParametricModel2005} use no quote or trade prices, but rather the order book volumes, trade sizes, and past trade signs for classification. No accuracies for classical trade signing rules are reported, which impedes a comparison across different works. In line with their results, we focus on non-linear models. Additionally, our paper addresses the mentioned shortcomings by benchmarking against state-of-the-art trade classification rules. We share the idea of using the trade size, as well as the bid and ask sizes for classification for some of our feature sets, but greedily predict using non-historic features. +The work of \textcite[\checkmark][483]{blazejewskiLocalNonParametricModel2005} compares a $k$-nearest neighbor classifier against logistic regression, as well as simple heuristics like the majority vote over past trades for signing trades at the Australian stock exchange. Their results indicate that the non-parametric $k$-nearest neighbor classifier improves upon a linear logistic regression in terms of classification accuracy, even when trained on fewer features. The work differs from the remaining works in its feature set definition. Notably, \textcite[\checkmark][483]{blazejewskiLocalNonParametricModel2005} use no quote or trade prices, but rather the order book volumes, trade sizes, and past trade signs for classification. No accuracies for classical trade signing rules are reported, which impedes a comparison across different works.
In line with their results, we focus on non-linear models. Additionally, our paper addresses the mentioned shortcomings by benchmarking against state-of-the-art trade classification rules. We share the idea of using the trade size, as well as the bid and ask sizes, for classification in some of our feature sets, but greedily predict using non-historic features. -Closest to our work is a publication by \textcite[1--58]{ronenMachineLearningTrade2022}. Therein, the authors compare a selection of machine learning algorithms against classical trade signing rules in the bond and stock market. Their comparison is the first to consider logistic regression, a random forest, as well as \glspl{feed-forward-network}. Over a wide range of feature sets the tree-based ensemble consistently outperforms by out-of-sample accuracy the tick rule and \gls{LR} algorithm, as well as all remaining machine learning models. For the TRACE and \gls{NASDAQ} dataset, their best variant of the random forest outperforms the tick rule by \SI{8.30}{\percent} and \SI{3.30}{\percent}, respectively \autocite[57]{ronenMachineLearningTrade2022}. Whilst the superiority of random forests is consistent for the bond and equity market, fitted classifiers do not transfer across markets, as accuracies diminish in a transfer setting. +Closest to our work is a publication by \textcite[\checkmark][1--58]{ronenMachineLearningTrade2022}. Therein, the authors compare a selection of machine learning algorithms against classical trade signing rules in the bond and stock market. Their comparison is the first to consider logistic regression, a random forest, as well as \glspl{feed-forward-network}. Over a wide range of feature sets, the tree-based ensemble consistently outperforms the tick rule and \gls{LR} algorithm, as well as all remaining machine learning models, in out-of-sample accuracy. For the TRACE and \gls{NASDAQ} datasets, their best variant of the random forest outperforms the tick rule by \SI{8.30}{\percent} and \SI{3.30}{\percent}, respectively \autocite[\checkmark][57]{ronenMachineLearningTrade2022}. Whilst the superiority of random forests is consistent for the bond and equity market, fitted classifiers do not transfer across markets, as accuracies diminish in a transfer setting. -The results convincingly demonstrate the potential of machine learning, i.e., of tree-based ensembles, for trade classification. Yet, the comparability of the results is limited by the classifier's reliance on additional features beyond quote and price data. Albeit, \textcite[13--14]{ronenMachineLearningTrade2022} consider a wide range of approaches, their selection leaves the latest advancements in artificial neural networks and ensemble learning aside and is mainly guided by computational constraints. Even if the focus is on standard techniques, the unclear research agenda concerning model selection, tuning, and testing hampers the transferability of their results to the yet unstudied option market. +The results convincingly demonstrate the potential of machine learning, i.e., of tree-based ensembles, for trade classification. Yet, the comparability of the results is limited by the classifier's reliance on additional features beyond quote and price data. Although \textcite[\checkmark][13--14]{ronenMachineLearningTrade2022} consider a wide range of approaches, their selection leaves the latest advancements in artificial neural networks and ensemble learning aside and is mainly guided by computational constraints.
Even if the focus is on standard techniques, the unclear research agenda concerning model selection, tuning, and testing hampers the transferability of their results to the yet unstudied option market. In summary, machine learning has been applied successfully in the context of trade classification. A summary is given in \cref{app:literature-ml-tc}. No previous work performs machine learning-based classification in the options markets. Our work fills this gap and models trade classification using machine learning to improve upon extant rules. @@ -30,7 +30,7 @@ \subsection{Research Framework}\label{sec:research-framework} The selection of machine learning methods in previous works is arbitrary and guided by computational constraints. Additionally, it leaves out advancements in machine learning. To address these limitations, we propose a comprehensive research framework for trade classification, outlined in \cref{fig:research-framework}. -Our approach revolves around two key ideas. First, we leverage \glspl{GBRT} and Transformers for trade classification. These methods are selected in \cref{sec:supervised-approaches} for their expected performance, scalability, and extensibility and later enhanced to learn on partially-labeled trades. Second, classical trade classification rules, such as the \gls{LR}, are realized as a rule-based classifier using a stacking principle describe in \cref{sec:stacked-rule}. This allows for a consistent evaluation and model interpretation, eventually bridging the gap between classical trade classification rules and machine learning. +Our approach revolves around two key ideas. First, we leverage \glspl{GBRT} and Transformers for trade classification. These methods are selected in \cref{sec:supervised-approaches} for their expected performance, scalability, and extensibility and later enhanced to learn on partially-labeled trades. Second, classical trade classification rules, such as the \gls{LR}, are realized as a rule-based classifier using a stacking principle described in \cref{sec:stacked-rule}. This allows for a consistent evaluation and model interpretation, eventually bridging the gap between classical trade classification rules and machine learning. \begin{figure}[!ht] \centering @@ -44,5 +44,3 @@ \subsection{Research Framework}\label{sec:research-framework} To assess the efficiency of our approach, we conduct a comprehensive empirical study. In preparation, the data preparation process, outlined in \cref{sec:data-and-data-preparation}, encompasses all steps necessary to obtain features to be processed by the classifiers. Model enhancements, training setups, and tuning procedures are detailed in \cref{sec:training-and-tuning}. The predictions of the classifiers are consistently evaluated in terms of accuracy as part of \cref{sec:evaluation}. With the model-agnostic interpretability method \gls{SAGE}, we study the reliance of the model on features and cross-compare the feature importances of classical trade classification rules and machine learning predictors. In turn, attention maps from Transformers yield model-specific insights. - -Finally, \cref{sec:application} tests all classifiers in the problem of effective spread calculation to demonstrate the effectiveness of our approach. 
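Because the contributions, the conclusion, and \cref{sec:application} repeatedly refer to effective spread estimation, a minimal pandas sketch of the textbook nominal effective spread, $2 D_t (P_t - M_t)$, is given below. The column names and the comparison of predicted versus true trade directions are assumptions for illustration only; the exact computation used in the study may differ.

import pandas as pd

def effective_spread(trades: pd.DataFrame, side_col: str = "side") -> pd.Series:
    # Nominal effective spread per trade: 2 * D * (P - M),
    # with D = +1 for buyer-initiated and -1 for seller-initiated trades.
    return 2.0 * trades[side_col] * (trades["trade_price"] - trades["mid"])

# Example: compare the spread implied by classifier predictions with the true spread.
# estimated_mean = effective_spread(df, side_col="side_predicted").mean()
# true_mean = effective_spread(df, side_col="side_true").mean()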
diff --git a/reports/Content/results.tex b/reports/Content/results.tex index d5570b2b..10a0ea3e 100644 --- a/reports/Content/results.tex +++ b/reports/Content/results.tex @@ -1,20 +1,22 @@ -\section{Results}\label{sec:results} +\section{Results and Discussion}\label{sec:results} This chapter compares the performance of rule-based trade classification against machine learning-based classification. -\subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-approaches} +\subsection{Performance}\label{sec:performance-results-discussion} -We estimate the accuracy of classical trade classification rules on the \gls{ISE} and \gls{CBOE} samples. We consider the tick and quote rule, as well as the \gls{LR} algorithm, \gls{EMO} rule and \gls{CLNV} method in their classical and reversed formulation. Additionally, we consider the \gls{GSU} method (small) and \gls{GSU} method (large) due to their state-of-the-art performance on the validation set, as derived in \cref{sec:hyperparameter-tuning}. +\subsubsection{Performance of Rule-Based Approaches}\label{sec:result-of-rule-based-approaches} -We report in \cref{tab:ise-classical} accuracies for the entire data set and separate subsets spanning the periods of train, validation, and test set as defined in \cref{sec:train-test-split}. Doing so enables comparisons with previous works, but also provides meaningful estimates on the test set relevant for benchmarking purposes. Our results are approximately similar to \textcite[][29--33]{grauerOptionTradeClassification2022}. Minor deviations exist, which can be pinned down to differences in handling of unclassified trades and non-positive spreads, as well as divergent implementations of the depth rule.\footnote{Correspondence with the author.} +We estimate the accuracy of classical trade classification rules on the \gls{ISE} and \gls{CBOE} samples. We consider the tick and quote rule, as well as the \gls{LR} algorithm, \gls{EMO} rule, and \gls{CLNV} method in their classical and reversed formulation. Additionally, we report the \gls{GSU} method (small) and \gls{GSU} method (large) due to their state-of-the-art performance on the validation set, as derived in \cref{sec:hyperparameter-tuning}. -From all rules, the tick rule performs worst when applied to trade prices at the trading venue with accuracies of a random guess, \SI{49.67}{\percent}. For comparison, a simple majority vote achieves \SI{51.40}{\percent} accuracy. The tick test performs best when estimated on the consecutive trade prices, and additionally, when estimated at the inter-exchange level marginally improves over a random classification, achieving accuracies of \SI{55.25}{\percent} for the reversed tick test. Due to the poor performance, of tick-based algorithms at the exchange level, we estimate all hybrids with last/next differing price from any exchange, dubbed $\operatorname{tick}_{\mathrm{all}}$ or $\operatorname{rtick}_{\mathrm{all}}$. +\cref{tab:ise-classical} reports the accuracies for the entire data set and for separate subsets spanning the periods of the train, validation, and test set as defined in \cref{sec:train-test-split}. Doing so enables comparisons with previous works, but also provides meaningful estimates on the test set relevant for benchmarking purposes. Our results are approximately similar to \textcite[\checkmark][40--42]{grauerOptionTradeClassification2022}. 
Minor deviations exist, which can be pinned down to differences in handling of unclassified trades and non-positive spreads, as well as divergent implementations of the depth rule.\footnote{Correspondence with the author.} -Quote-based algorithms outperform tick-based algorithms delivering accuracy up to \SI{63.72}{\percent} when estimated on the \gls{NBBO}. The superiority of quote-based algorithms in option trade classification has previously been documented in \textcites[][891]{savickasInferringDirectionOption2003}[][3]{grauerOptionTradeClassification2022}. +Of all rules, the tick rule performs worst when applied to trade prices at the trading venue with the accuracy of a random guess, \SI{49.67}{\percent}. For comparison, a na\"ive majority vote achieves \SI{51.40}{\percent} accuracy. The tick test performs best when estimated on consecutive trade prices. Additionally, when estimated at the inter-exchange level, it marginally improves over a random classification, achieving accuracies of \SI{55.25}{\percent} for the reversed tick test. Due to the poor performance of tick-based algorithms at the exchange level, we estimate all hybrids with the last/next differing price from any exchange, distinguished by $\operatorname{tick}_{\mathrm{all}}$ or $\operatorname{rtick}_{\mathrm{all}}$. + +Quote-based algorithms outperform tick-based algorithms, delivering accuracies of up to \SI{63.72}{\percent} when estimated on the \gls{NBBO}. The superiority of quote-based algorithms in option trade classification has previously been documented in \textcites[\checkmark][891]{savickasInferringDirectionOption2003}[\checkmark][3]{grauerOptionTradeClassification2022}. \begin{table}[ht] \centering - \caption[Accuracies of Rule-Based Approaches on \glsentryshort{ISE}]{This table shows the accuracy of common trade classification rules and their variations for option trades on \gls{ISE} sample. Unclassifiable trades by the respective rule are assigned randomly as buy or sell. Hybrid methods are estimated using trade prices across all exchanges. We report the percentage of classifiable trades and the overall accuracy for subsets based on our train-test split and the entire dataset. The best rule is in bold.} + \caption[Accuracies of Rule-Based Approaches on \glsentryshort{ISE}]{Accuracy of common trade classification rules and their variations for option trades on the \gls{ISE} sample. Unclassifiable trades by the respective rule are assigned randomly as buy or sell. Hybrid methods are estimated using trade prices across all exchanges. We report the percentage of classifiable trades and the overall accuracy for subsets based on our train-test split and the entire dataset. The best rule is in bold.} \label{tab:ise-classical} \begin{tabular}{@{}lSSSSS@{}} \toprule @@ -46,7 +48,7 @@ \subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-app
-Notably, the \gls{GSU} method (large) featuring overrides from the trade size and depth rules performs best, achieving \SI{67.61}{\percent} accuracy on the \gls{ISE} test set and \SI{75.49}{\percent} on the entire dataset. Yet, the performance deteriorates most sharply between sets, as visualized in \cref{fig:classical-accuracies-over-time}. +Notably, the \gls{GSU} method (large) featuring overrides from the trade size and depth rules performs best, achieving \SI{67.61}{\percent} accuracy on the test set and \SI{75.49}{\percent} on the entire dataset. Yet, the performance deteriorates most sharply between sets, as visualized in \cref{fig:classical-accuracies-over-time}. \begin{figure}[ht] \centering @@ -57,7 +59,7 @@ \subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-app \begin{table}[ht] \centering - \caption[Accuracies of Rule-Based Approaches on \glsentryshort{CBOE}]{This table shows the accuracy of common trade classification rules and their variations for option trades on \gls{CBOE} sample. Unclassifiable trades by the respective rule are assigned randomly as buy or sell. Hybrid methods are estimated using trade prices across all exchanges. We report the percentage of classifiable trades and the overall accuracy for subsets based on our train-test split and the entire dataset. The best rule is in bold.} + \caption[Accuracies of Rule-Based Approaches on \glsentryshort{CBOE}]{Accuracy of common trade classification rules and their variations for option trades on \gls{CBOE} sample. Unclassifiable trades by the respective rule are assigned randomly as buy or sell. Hybrid methods are estimated using trade prices across all exchanges. We report the percentage of classifiable trades and the overall accuracy for subsets based on our train-test split and the entire dataset. The best rule is in bold.} \label{tab:cboe-classical} \begin{tabular}{lSSSS} \toprule @@ -86,7 +88,7 @@ \subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-app \end{tabular} \end{table} -We repeat the analysis on the \gls{CBOE} dataset in \cref{tab:cboe-classical} and observe a similar ranking to \cref{tab:ise-classical}. Overall, the performance of classical trade classification rules further diminishes or remains at a low level. Tick-based rules trail the performance of quote-based approaches, and the accuracy of hybrids varies with the dependence on the tick test. Different from the \gls{ISE} sample, the quote rule estimated on the \gls{NBBO}, leads to a degraded performance than the quote rule applied to \gls{CBOE} quotes. Again, \gls{GSU} method (small) and \gls{GSU} method (large) perform best, though the strong outperformance does not carry over to the test set as depicted \cref{fig:classical-accuracies-over-time}.\footnote{Performance on \gls{CBOE} can be improved if the order of quote rules is reversed. For full combinatoric coverage see \textcite[][33]{grauerOptionTradeClassification2022}. To avoid overfitting the test set by classical rules, we keep the baseline constant following our reasoning from \cref{sec:hyperparameter-tuning}.} +We repeat the analysis on the \gls{CBOE} dataset in \cref{tab:cboe-classical} and observe a similar ranking to \cref{tab:ise-classical}. Overall, the performance of classical trade classification rules further diminishes or remains at a low level. Tick-based rules trail the performance of quote-based approaches, and the accuracy of hybrids varies with the dependence on the tick test. 
None of the rules reaches the classification performance in \textcite[\checkmark][886]{savickasInferringDirectionOption2003} on an earlier subset of \gls{CBOE}. Different from the \gls{ISE} sample, the quote rule estimated on the \gls{NBBO} leads to worse performance than the quote rule applied to \gls{CBOE} quotes. Again, the \gls{GSU} method (small) and the \gls{GSU} method (large) perform best, though the strong outperformance does not carry over to the test set, as depicted in \cref{fig:classical-accuracies-over-time}.\footnote{Performance on \gls{CBOE} can be improved if the order of quote rules is reversed. For full combinatoric coverage see \textcite[\checkmark][44]{grauerOptionTradeClassification2022}. To avoid overfitting the test set by classical rules, we keep the baseline constant following our reasoning from \cref{sec:hyperparameter-tuning}.} \begin{figure}[!ht] \centering @@ -95,29 +97,30 @@ \subsection{Results of Rule-Based Approaches}\label{sec:result-of-rule-based-app \label{fig:classical-coverage-over-time} \end{figure} -From \cref{tab:ise-classical,tab:cboe-classical} we see, that practically all rule-based approaches leave trades unclassified. This is due to conceptual constraints in the rule itself, but also a result of missing or corrupted data, which equally affects rules with theoretical full coverage. As visualized in \cref{fig:classical-coverage-over-time} coverage decreases qualitatively for selected classification rules over time. It is particularly low when the trade initiator is inferred from the \gls{NBBO}. Theoretically, the tick test can achieve full coverage, in our sample it classifies only $\approx$ \SI{91.5}{\percent}, which is significantly lower than coverage rates reported in the stock market \autocite[][535]{ellisAccuracyTradeClassification2000}. The low, fluctuating coverage stems from the absence of a distinguishable trade price. For the quote rule, we isolate missing or inverted quotes from midspread trades. Through comparison between \cref{fig:classical-coverage-over-time} and \cref{fig:classical-at-mid-over-time} it is evident, that the majority of unclassified trades are midspread trades, whose share increases over time. In our datasets, hybrids, have the advantage of leveraging multiple data sources, resulting in more complete coverage. If, as in the combinations of \textcite[][18--19]{grauerOptionTradeClassification2022}, the basic rules are strong individually, higher coverage is associated with better performance, as fewer trades are classified by a fallback mechanism. +From \cref{tab:ise-classical,tab:cboe-classical} we see that practically all rule-based approaches leave trades unclassified. This is due to conceptual constraints in the rule itself, but also a result of missing or corrupted data, which equally affects rules with theoretical full coverage. As visualized in \cref{fig:classical-coverage-over-time}, coverage decreases for selected classification rules over time. It is particularly low when the trade initiator is inferred from the \gls{NBBO}. Theoretically, the tick test can achieve full coverage; in our sample, it classifies only $\approx$ \SI{91.5}{\percent}, which is significantly lower than coverage rates reported in the stock market \autocite[\checkmark][535]{ellisAccuracyTradeClassification2000} and option market \autocite[\checkmark][886]{savickasInferringDirectionOption2003} (no-reset variant). The low, fluctuating coverage stems from the absence of a distinguishable trade price.
For the quote rule, we isolate missing or crossed quotes from midspread trades. + +Through comparison between \cref{fig:classical-coverage-over-time} and \cref{fig:classical-at-mid-over-time} it is evident, that the majority of unclassified trades are midspread trades, whose share increases over time. The outliers in \cref{fig:classical-coverage-over-time} are due to missing quotes. +Next, we test the performance of supervised classifiers on the test sets. \begin{figure}[!ht] \centering \includegraphics{classical_at_mid_over_time.pdf} - \caption[Mid-Spread Trades Over Time]{Percentage of midspread trades on \gls{ISE} and \gls{CBOE} sample over time. Estimated using \gls{NBBO} quotes. The bar \myline{} indicates the beginning of a new subset based on the train-test split.} + \caption[Mid-Spread Trades Over Time]{Percentage of midspread trades on \gls{ISE} and \gls{CBOE} sample over time and estimated using \gls{NBBO} quotes. The bar \myline{} indicates the beginning of a new subset based on the train-test split.} \label{fig:classical-at-mid-over-time} \end{figure} -Our machine learning classifiers are robust to missing data, as they can learn alternate patterns for missing features. Next, we test the supervised classifiers on the \gls{ISE}/\gls{CBOE} test sets, which prove to be a challenging test ground for rule-based classifiers as our results from above indicate. - -\subsection{Results of Supervised +\subsubsection{Performance of Supervised Models}\label{sec:results-of-supervised-models} -We test the performance of our supervised models. We take the best configurations from \cref{sec:hyperparameter-tuning}, trained and tuned on the \gls{ISE} trade data, and evaluate their performance on the \gls{ISE} and \gls{CBOE} test sets. \cref{tab:results-supervised-ise-cboe} summarizes the results and benchmarks against state-of-the-art solutions from the literature. +We take the best configurations from \cref{sec:hyperparameter-tuning}, trained and tuned on the \gls{ISE} trade data, and evaluate their performance on the \gls{ISE} and \gls{CBOE} test sets. \cref{tab:results-supervised-ise-cboe} summarizes the results and benchmarks against state-of-the-art solutions from the literature. \begin{table}[ht] \centering - \caption[Accuracies of Supervised Approaches]{This table reports the accuracy of supervised \glspl{GBRT} and Transformers for different feature combinations on the \gls{ISE} and \gls{CBOE} datasets. The improvement is estimated as the absolute change in accuracy between the classifier and the benchmark. For feature set classical, $\operatorname{gsu}_{\mathrm{small}}$ is the benchmark and otherwise $\operatorname{gsu}_{\mathrm{large}}$. Models are trained on the \gls{ISE} training set. The best classifier per dataset is in bold.} + \caption[Accuracies of Supervised Approaches]{Accuracy of supervised \glspl{GBRT} and Transformers for different feature combinations on the \gls{ISE} and \gls{CBOE} datasets. The improvement is estimated as the absolute change in accuracy between the classifier and the benchmark. For the feature set classic, \gls{GSU} (small) is the benchmark, and otherwise \gls{GSU} (large). Models are trained on the \gls{ISE} training set. 
The best classifier per dataset is in bold.} \label{tab:results-supervised-ise-cboe} \begin{tabular}{@{}llSSSSSS@{}} \toprule - & & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){3-4}\cmidrule(lr){5-6} \cmidrule(lr){7-8} + & & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){3-4}\cmidrule(lr){5-6} \cmidrule(lr){7-8} Dataset & Classifier & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\ \midrule \gls{ISE} & \gls{GBRT} & 63.668637 & 3.620000 & 72.343640 & 4.730000 & \bfseries 74.120496 & \bfseries 6.510000 \\ & Transformer & \bfseries 63.783020 & \bfseries 3.730000 & \bfseries 72.581107 & \bfseries 4.970000 & 73.921795 & 6.310000 \\ \addlinespace @@ -126,25 +129,23 @@ \subsection{Results of Supervised \end{tabular} \end{table} -Both model architectures consistently outperform their respective benchmarks on the \gls{ISE} and \gls{CBOE} datasets, establishing a state-of-the-art in option trade classification with comparable data requirements. Thereby, Transformers dominate the \gls{ISE} sample when trained on trade prices and quotes reaching \SI{63.783020}{\percent} in accuracy and \SI{66.18}{\percent} on the \gls{CBOE} sample outperforming previous approaches by \SI{3.730000}{\percent} and \SI{5.440000}{\percent}. Additional trade size features improve the accuracy to \SI{72.581107}{\percent} for the \gls{ISE} sample and \SI{72.153338}{\percent} for the \gls{CBOE} sample. Gradient boosting outperforms all other approaches when trained on additional option features. - -While absolute improvements in accuracy over $\operatorname{gsu}_{\mathrm{small}}$ are modest on the smallest feature set, improvements are substantial for larger feature sets ranging between \SI{4.730000}{\percent} to \SI{7.860000}{\percent} over $\operatorname{gsu}_{\mathrm{large}}$. Specifically, the addition of trade size-related features positively contributes to the performance. We discuss feature importances in \cref{sec:feature-importance}. +Both model architectures consistently outperform their respective benchmarks on the \gls{ISE} and \gls{CBOE} datasets, establishing a state-of-the-art in option trade classification with comparable data requirements. Transformers dominate the \gls{ISE} sample when trained on trade prices and quotes, reaching \SI{63.783020}{\percent} accuracy, and achieve \SI{66.18}{\percent} on the \gls{CBOE} sample, outperforming previous approaches by \SI{3.730000}{\percent} and \SI{5.440000}{\percent}, respectively. Additional trade size features have the strongest impact on performance and improve the accuracy to \SI{72.581107}{\percent} for the \gls{ISE} sample and \SI{72.153338}{\percent} for the \gls{CBOE} sample. Gradient boosting outperforms all other approaches when trained on additional option features. The results can be enhanced through re-training on the validation set, improving accuracies to \SI{76.162269}{\percent}, as documented in \cref{app:results-of-supervised-models-with-re-training}. In favor of conservative estimates, our models in the main text do not use this technique. -To formally test, whether differences between both classifiers are significant, we construct contingency tables and pair-wise compare predictions using McNemar's test \autocite[][153--157]{mcnemarNoteSamplingError1947}.
We formulate the null hypothesis that both classifiers have the same error rate. -Conceptually similar \textcite[][267]{odders-whiteOccurrenceConsequencesInaccurate2000}, uses contingency tables of rule-based methods and true labels. Here, contingency tables are used to pair-wise compare the predictions of \glspl{GBRT} against Transformers. +Visually, the performance differences between gradient boosting and Transformers on the same feature sets are minor. To formally test whether the differences between both classifiers are significant, we construct contingency tables and pair-wise compare predictions using McNemar's test \autocite[\checkmark][153--157]{mcnemarNoteSamplingError1947}. We formulate the null hypothesis that both classifiers have the same error rate.
+\textcite[\checkmark][267]{odders-whiteOccurrenceConsequencesInaccurate2000} similarly uses contingency tables, albeit of rule-based methods and true labels. Here, contingency tables are used to pair-wise compare the predictions of \glspl{GBRT} against Transformers. \begin{table}[!ht] \centering \sisetup{table-number-alignment=right, table-format=7.0} - \caption[Contingency Tables of Supervised Classifiers]{This table contains the contingency tables of the supervised classifiers on the \gls{CBOE} and \gls{ISE} test set for feature set classical, classical-size, and option. Cells sum the number of trades, correctly/falsely classified by both classifiers or one. Additionally, McNemar's test statistic $\chi^2$ and the associated $p$-value are reported.} + \caption[Contingency Tables of Supervised Classifiers]{Contingency tables of the supervised classifiers on the \gls{CBOE} and \gls{ISE} test set for the feature sets classic, size, and option. Cells sum the number of trades correctly or falsely classified by both classifiers or by only one of them. Additionally, McNemar's test statistic $\chi^2$ and the associated $p$-value are reported.} \label{tab:contigency-supervised-classifiers} \begin{tabular}{@{}llSSSSSS@{}} \toprule - & & \multicolumn{2}{c}{{\glsentryshort{FS} Classical}} & \multicolumn{2}{c}{{\glsentryshort{FS} Size}} & \multicolumn{2}{c}{{\glsentryshort{FS} Option}} \\ + & & \multicolumn{2}{c}{{\glsentryshort{FS} Classic}} & \multicolumn{2}{c}{{\glsentryshort{FS} Size}} & \multicolumn{2}{c}{{\glsentryshort{FS} Option}} \\ \cmidrule(l){3-4}\cmidrule(l){5-6}\cmidrule(l){7-8} - \multicolumn{2}{l}{{$\downarrow$ Trans.$\rightarrow$ \gls{GBRT}}} & {Correct} & {Wrong} & {Correct} & {Wrong} & {Correct} & {Wrong} \\ + \multicolumn{2}{l}{{$\downarrow$ \gls{GBRT} $\rightarrow$ Trans.}} & {Correct} & {Wrong} & {Correct} & {Wrong} & {Correct} & {Wrong} \\ \midrule \gls{ISE} & Correct & 5904530 & 374201 & 6790958 & 343265 & 6722730 & 586719 \\ & Wrong & 385481 & 3197364 & 366683 & 2360670 & 567124 & 1985003 \\ \addlinespace @@ -159,33 +160,22 @@ \subsection{Results of Supervised \end{tabular} \end{table} -Based on the contingency tables in \cref{tab:contigency-supervised-classifiers}, we observe that both models share a large portion of trades, for which both classifiers agree.\footnote{Through summation of correct classifications of one classifier divided by the matrix sum, one obtains the accuracy from \cref{tab:results-supervised-ise-cboe}. Consider the first entry, e.g., $(\num{5904530}+\num{374201}) / (\num{5904530} + \num{374201} + \num{385481} + \num{3197364}) \approx \num{0.63668637}$.} For larger feature sets, the share of trades correctly classified by one classifier grows, while the number of jointly correctly classified trades plateaus.
This can be an indication, that both models learn specific patterns and excel in different trades. The performance differences between classifiers are statistically significant at the \SI{1}{\percent}. The null hypothesis can be rejected. - -Relative to related works performing trade classification with machine learning, the improvements are strong, as documented in \cref{app:literature-ml-tc}. As no other work studies the option market or identical model architectures, the results are indicative. The studies report improvements between \SI{1.1}{\percent} and \SI{13.3}{\percent} for their machine learning models over the benchmark. Our absolute improvements exceed all linear models, but the absolute improvements are smaller relative to some tree-based and deep learning models in \textcite[][49]{ronenMachineLearningTrade2022}. At the same time, our models are trained on significantly fewer features and on a static training set requiring a fraction of the training cost. We believe, our conservative framing aligns well with scenarios, where trade classification is only a prerequisite to other empirical research. +Based on the contingency tables in \cref{tab:contigency-supervised-classifiers}, we see that both models share a large portion of trades, for which both classifiers agree.\footnote{Through summation of correct classifications of one classifier divided by the matrix sum, one obtains the accuracy from \cref{tab:results-supervised-ise-cboe}. Consider the first entry, e.g., $(\num{5904530}+\num{374201}) / (\num{5904530} + \num{374201} + \num{385481} + \num{3197364}) \approx \num{0.63668637}$.} For larger feature sets, the share of trades correctly classified by one classifier grows, while the number of jointly correctly classified trades plateaus. Thus, both models learn specific patterns and excel in different trades. The performance differences between classifiers are statistically significant at the \SI{1}{\percent} level. The null hypothesis can be rejected. -Visually, the performance differences between gradient boosting and Transformers on the same feature sets are minor, which is in accordance to \textcites{grinsztajnWhyTreebasedModels2022}{gorishniyRevisitingDeepLearning2021}. These studies conclude, generally for tabular modeling, that neither Transformers nor \glspl{GBRT} are universally superior. Our results validate this observation, specifically for trade classification. -% \todo{It is conceivable, that ...} +In summary, our supervised methods establish a new state-of-the-art in option trade classification. Our approach achieves full coverage and outperforms all previously reported classification rules in terms of accuracy. Performance transfers across exchanges. We perform additional robustness checks in \cref{sec:robustness-results} to identify any systematic misclassification. -% Our findings thereby contradict those of \textcite[][14--49]{ronenMachineLearningTrade2022}, who benchmark tree-based ensembles in the form of random forests and neural networks in the form of \gls{FFN} for trade classification in the equity and bond market and find clear dominance of the tree-based approach. Beyond differences in the market under study and variants, two methodological differences are evident, that explain the diverging results. First, unlike \gls{FFN}, the FT-Transformer is tailored to learn on tabular data through being a rotationally-invariant learner. Second, the data pre-processing and feature engineering is tailored to the requirements of neural networks. 
Without these measures, tree-based approaches excel due to their robustness in handling skewed and missing data. - -Despite the lack of adaption to \gls{CBOE} data, the performance improvements are highest for the \gls{CBOE} dataset. This result is in stark contrast to the of \textcite[][32]{ronenMachineLearningTrade2022}, who test random forests for trade classification and report subpar performance. Their setting differs from ours, as they apply ensembles trained in the bond market to equity trades. Moreover, it is unclear if data preprocessing procedures are shared between both sets, which may hamper performance. - -Part of the strong performance on \gls{CBOE} trades hails from weaker benchmark performance, but also from a stronger accuracy of classifiers on the smallest and mid-sized feature sets. One would expect a degradation between sets, assuming exchange-specific trading patterns. - -In summary, our supervised methods establish a new state-of-the-art in option trade classification. Our approach achieves full coverage and outperforms all previously reported classification rules in terms of accuracy. Performance transfers across exchanges. We perform additional robustness checks in \cref{sec:robustness-checks} to identify any systematic misclassification. - -\subsection{Results of Semi-supervised +\subsubsection{Performance of Semi-supervised Models}\label{sec:results-of-semi-supervised-models} We compare the performance of pre-trained Transformers and self-trained gradient-boosting on the \gls{ISE} and \gls{CBOE} test set. Results are reported in \cref{tab:results-semi-supervised-ise-cboe}. \begin{table}[ht] \centering - \caption[Accuracies of Semi-Supervised Approaches]{This table reports the accuracy of semi-supervised \glspl{GBRT} and Transformers for different feature combinations on the \gls{ISE} and \gls{CBOE} datasets. The improvement is estimated as the absolute change in accuracy between the classifier and the benchmark. For feature set classical, $\operatorname{gsu}_{\mathrm{small}}$ is the benchmark and otherwise $\operatorname{gsu}_{\mathrm{large}}$. Models are trained on the \gls{ISE} training set. The best classifier per dataset is in bold.} + \caption[Accuracies of Semi-Supervised Approaches]{Accuracy of semi-supervised \glspl{GBRT} and Transformers for different feature combinations on the \gls{ISE} and \gls{CBOE} datasets. The improvement is estimated as the absolute change in accuracy between the classifier and the benchmark. For feature set classic, $\operatorname{gsu}_{\mathrm{small}}$ is the benchmark and otherwise $\operatorname{gsu}_{\mathrm{large}}$. Models are trained on the \gls{ISE} training set. The best classifier per dataset is in bold.} \label{tab:results-semi-supervised-ise-cboe} \begin{tabular}{@{}llSSSSSS@{}} \toprule - & & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){3-4}\cmidrule(lr){5-6} \cmidrule(lr){7-8} + & & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){3-4}\cmidrule(lr){5-6} \cmidrule(lr){7-8} Dataset & Classifier & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. 
in \%} & {+/-} \\ \midrule \gls{ISE} & \gls{GBRT} & 63.397514 & 3.350000 & 72.156489 & 4.550000 & 73.536644 & 5.930000 \\ & Transformer & \bfseries 64.655751 & \bfseries 4.600000 & \bfseries 72.859054 & \bfseries 5.250000 & \bfseries 74.551410 & \bfseries 6.940000 \\ \addlinespace @@ -194,20 +184,20 @@ \subsection{Results of Semi-supervised \end{tabular} \end{table} -Identical to the supervised case, our models consistently outperform their respective benchmarks. Gradient boosting with self-training surpasses $\operatorname{gsu}_{\mathrm{small}}$ by \SI{3.350000}{\percent} on \gls{ISE} and \SI{5.440000}{\percent} on \gls{CBOE} in accuracy. Improvements for larger feature sets over $\operatorname{gsu}_{\mathrm{large}}$ are marginally lower to the supervised model and range between \SI{4.550000}{\percent} and \SI{7.440000}{\percent}. We already observed a similar result on the validation set in \cref{sec:hyperparameter-tuning}. +Identical to the supervised case, our models consistently outperform their respective benchmarks. Gradient boosting with self-training surpasses \gls{GSU} (small) by \SI{3.350000}{\percent} on \gls{ISE} and \SI{5.440000}{\percent} on \gls{CBOE} in accuracy. Improvements for larger feature sets over \gls{GSU} (large) are marginally lower than for the supervised model and range between \SI{4.550000}{\percent} and \SI{7.440000}{\percent}. This extends our finding for the validation set in \cref{sec:hyperparameter-tuning}. -Pre-training is beneficial for the performance of Transformers on \gls{ISE} trades, improving over Transformer with random initialization by up to \SI{0.87000}{\percent}. Hence, the performance improvement from pre-training on the validation set carries over the test set. On the \gls{CBOE} dataset, pre-training hurts performance. +Pre-training is beneficial for the performance of Transformers on \gls{ISE} trades, improving over the randomly initialized Transformer by up to \SI{0.87000}{\percent}. Hence, the performance improvement from pre-training on the validation set carries over to the test set. On the \gls{CBOE} dataset, pre-training hurts performance. \begin{table}[!ht] \centering \sisetup{table-number-alignment=right, table-format=7.0} - \caption[Contingency Tables of Semi-Supervised Classifiers]{This table contains the contingency tables of the semi-supervised classifiers on the \gls{CBOE} and \gls{ISE} test set for feature set classical, classical-size, and option. Cells sum the number of trades, correctly/falsely classified by both classifiers or one. Additionally, McNemar's test statistic $\chi^2$ and the associated $p$-value are reported.} + \caption[Contingency Tables of Semi-Supervised Classifiers]{Contingency tables of the semi-supervised classifiers on the \gls{CBOE} and \gls{ISE} test set for the feature sets classic, size, and option. Cells sum the number of trades correctly or falsely classified by both classifiers or by only one of them.
Additionally, McNemar's test statistic $\chi^2$ and the associated $p$-value are reported.} \label{tab:contigency-semi-supervised-classifiers} \begin{tabular}{@{}llSSSSSS@{}} \toprule - & & \multicolumn{2}{c}{{\glsentryshort{FS} Classical}} & \multicolumn{2}{c}{{\glsentryshort{FS} Size}} & \multicolumn{2}{c}{{\glsentryshort{FS} Option}} \\ + & & \multicolumn{2}{c}{{\glsentryshort{FS} Classic}} & \multicolumn{2}{c}{{\glsentryshort{FS} Size}} & \multicolumn{2}{c}{{\glsentryshort{FS} Option}} \\ \cmidrule(l){3-4}\cmidrule(l){5-6}\cmidrule(l){7-8} - \multicolumn{2}{l}{{$\downarrow$ Trans.$\rightarrow$ \gls{GBRT}}} & {Correct} & {Wrong} & {Correct} & {Wrong} & {Correct} & {Wrong} \\ + \multicolumn{2}{l}{{$\downarrow$ \gls{GBRT} $\rightarrow$ Trans.}} & {Correct} & {Wrong} & {Correct} & {Wrong} & {Correct} & {Wrong} \\ \midrule \gls{ISE} & Correct & 5740391 & 511603 & 6658028 & 457739 & 6665176 & 586696 \\ & Wrong & 635685 & 2973897 & 527023 & 2218786 & 686768 & 1922936 \\ \addlinespace @@ -222,41 +212,65 @@ \subsection{Results of Semi-supervised \end{tabular} \end{table} -As evident from \cref{tab:contigency-semi-supervised-classifiers}, a vast majority of trades are classified by both classifiers correctly. For the \gls{ISE}, performance improvements in larger feature sets are driven by trades that are distinctly classified by both classifiers. In turn, at the \gls{CBOE}, the share of common classifications continues to grow. Performance differences between classifiers estimated by the McNemar test are significant. +As evident from \cref{tab:contigency-semi-supervised-classifiers}, the majority of trades are classified by both classifiers correctly. For the \gls{ISE}, performance improvements in larger feature sets are driven by trades classified differently by the two classifiers. In turn, at the \gls{CBOE}, the share of common classifications continues to grow. Performance differences between classifiers estimated by the McNemar test are significant at the \SI{1}{\percent} level. + +To summarize, semi-supervised variants of \glspl{GBRT} do not provide better generalization performance than supervised approaches, despite significantly higher training costs. Pre-training of Transformers improves performance on the \gls{ISE} sample but slightly deteriorates performance on the \gls{CBOE} set. We later evaluate whether semi-supervised learning improves robustness, if not performance. + +\subsubsection{Performance Discussion}\label{sec:performance-discussion} + +\textbf{Rule-based Classification} + +Basic classification rules leave every tenth trade unclassified. Thus, the practical coverage of rule-based classification is lower than the theoretical coverage previously documented in \textcite[][40--42]{grauerOptionTradeClassification2022}, as it accounts not just for limitations of the rules but also for limitations of the data. In our datasets, hybrids have the advantage of leveraging multiple data sources, resulting in extended coverage. If, as in the layered \gls{GSU} method, the basic rules are strong individually, higher coverage is associated with better performance, as fewer trades are filled by a fallback mechanism. + +Naturally, our machine learning classifiers can classify any trade and are robust to missingness, as models can learn alternate patterns for missing features. Consider the classification of a trade based on quotes at the trading venue, where the quotes are missing. The quote rule leaves the trade unclassified.
Transformers, however, can attend to any other feature within the context and readily classify the trade. Similarly, \glspl{GBRT} can split by alternative criteria. + +% \vskip 1.3in + +\textbf{Supervised Classification} + +While statistically significant, performance differences between supervised \gls{GBRT} and Transformers are small relative to their improvements over rule-based classification on the same feature sets. This aligns with \textcite[\checkmark][18941]{gorishniyRevisitingDeepLearning2021}, who conclude, generally for tabular modeling, that neither Transformers nor \glspl{GBRT} are universally superior. Our results confirm this observation in the context of trade classification. + +The results contradict those of \textcite[\checkmark][14--49]{ronenMachineLearningTrade2022}, who benchmark random forests and \gls{FFN} for trade classification in the equity and bond market and find clear dominance of the tree-based approach. Beyond differences in the market under study and variants, two methodological differences are evident that explain the contrasting results. First, unlike \gls{FFN}, the FT-Transformer is tailored to learn on tabular data through being a rotationally-invariant learner. Second, our data pre-processing and feature engineering are adapted to the requirements of neural networks. Without these measures, tree-based approaches excel due to their robustness in handling skewed and missing data. + +Despite the lack of adaption to \gls{CBOE} data, the performance improvements are highest for the \gls{CBOE} dataset. Part of the strong performance on \gls{CBOE} trades hails from weaker benchmark performance, but also from a stronger accuracy of the classifiers on the smallest and mid-sized feature sets. Again, this result is in stark contrast to \textcite[\checkmark][32]{ronenMachineLearningTrade2022}, who report subpar performance for random forests. Their setting differs from ours, as they apply ensembles trained in the bond market to equity trades. Moreover, it is unclear if data preprocessing procedures are shared between both sets, which may hamper performance. + +Relative to related works performing trade classification with supervised machine learning, the improvements are strong, as documented in \cref{app:literature-ml-tc}. As no other work studies the option market or identical model architectures, the results are only indicative. The studies report improvements between \SI{1.1}{\percent} and \SI{13.3}{\percent} for their machine learning models over the benchmark. Our absolute improvements exceed all linear models, but the absolute improvements are smaller relative to some tree-based and deep learning models in \textcite[\checkmark][49]{ronenMachineLearningTrade2022}. At the same time, our models are trained on significantly fewer features and a static training set, requiring a fraction of the training cost. We believe our conservative framing aligns well with scenarios where trade classification is only a prerequisite to other empirical research. + +\textbf{Semi-supervised Classification} As no previous work performed semi-supervised classification, we focus our discussion on the performance difference between pre-training and self-training. On \gls{ISE} data, pre-training with the \gls{RTD} objective on unlabeled trades yields significantly stronger performance.
The results align with the intuition from \cref{sec:extensions-to-transformer} that pre-training exposes the model to a larger quantity of trades, which strengthens its ability to learn generalizable knowledge about the data that is useful in later trade classification. Also, the model is exposed to more diverse trades, as unlabeled trades are not restricted by customer type or trading activity, effectively preventing overfitting. -An explanation as to why pre-training improves performance on \gls{ISE} but not \gls{CBOE} trades, may be found in the pre-training data and setup. Trades used for pre-training are recorded at the \gls{ISE} only and are repeatedly shown to the model. While our pre-training objective is stochastic with different features being masked in each epoch, past research has shown that repeatedly presenting the same tokens in conjunction with a small-sized pre-training dataset, can degrade performance on the downstream classification task. For instance, \textcite[][27--28]{raffelExploringLimitsTransfer2020} document in the context of language modeling that a high degree of repetition encourages memorization in the model, but few repetitions are not harmful. As each trade is only shown $20\times$ to the model, but the size of the dataset is significantly smaller, the true impact remains unclear. Future work could revisit pre-training on a larger subset of LiveVol, incorporating trades from different exchanges, whereby each trade is only shown once to the model. We assume, that such a setup would, analogous to language modeling, improve performance on both \gls{ISE} and \gls{CBOE} trades, as the model is less prone to memorize data and learns a more diverse context. +An explanation as to why pre-training improves performance on \gls{ISE} but not \gls{CBOE} trades may be found in the pre-training data and setup. It is conceivable that pre-training encodes exchange-specific knowledge. Trades used for pre-training are recorded at the \gls{ISE} only and are repeatedly shown to the model. While our pre-training objective is stochastic with different features being masked in each epoch, past research has shown that repeatedly presenting the same tokens in conjunction with a small-sized pre-training dataset can degrade performance on the downstream classification task. For instance, \textcite[\checkmark][27--28]{raffelExploringLimitsTransfer2020} document in the context of language modeling that a high degree of repetition encourages memorization in the model, but few repetitions are not harmful. As each trade is shown only $20\times$ to the model, but the dataset is significantly smaller, the true impact remains unclear. Future work could revisit pre-training on a larger subset of LiveVol, incorporating trades from different exchanges, whereby each trade is shown only once to the model. We assume that such a setup would, analogous to language modeling, improve performance on both \gls{ISE} and \gls{CBOE} trades, as the model is less prone to memorize data and learns a more diverse context. -Self-training with \glspl{GBRT} as a base learner generally performs worse than \glspl{GBRT} trained on labeled trades, which contradicts our initial motivation for self-training in \cref{sec:extensions-to-gradient-boosted-trees}. With the pseudo labels derived from high-confident predictions, the success of self-training hinges on the reliability of the predicted class probabilities.
In our analysis of the default \gls{GBRT} in \cref{sec:training-and-tuning} we observed that the validation loss in terms of sample-wise cross-entropy loss stagnates due to a growing number of overconfident but erroneous predictions. Although we cannot confirm for the self-training classifier, due to the absence of true labels, it is conceivable, that the increased number of confident yet incorrect predictions, affects the generated pseudo labels. Without the ability to correct for errors, self-training performance on the validation and test set is directly impacted. +Self-training with \glspl{GBRT} as a base learner generally performs worse than \glspl{GBRT} trained on labeled trades, which contradicts our initial motivation for self-training in \cref{sec:extensions-to-gradient-boosted-trees}. With the pseudo labels derived from high-confidence predictions, the success of self-training hinges on the reliability of the predicted class probabilities. In our analysis of the default \gls{GBRT} in \cref{sec:training-and-tuning}, we observe that the validation loss in terms of sample-wise cross-entropy loss stagnates due to a growing number of overconfident but erroneous predictions. Although we cannot confirm this for the self-training classifier due to the absence of true labels, it is conceivable that the increased number of confident yet incorrect predictions affects the generated pseudo labels. Without the ability to correct for errors, self-training performance on the validation and test sets is directly impacted. -To summarize, unrewarded for higher training costs, semi-supervised variants of \glspl{GBRT} do not provide better generalization performance than supervised approaches. Pre-training of Transformers improves performance on the \gls{ISE} sample but slightly deteriorates performance on the \gls{CBOE} set. We subsequently evaluate if semi-supervised learning improves robustness if not performance. +\textbf{Limitations} -\subsection{Robustness of Results}\label{sec:robustness-checks} +Despite the strong performance of our approaches, it is important to acknowledge their limitations. Inference of the classifiers is computationally cheap, but training requires a significant amount of compute. To make training feasible, great effort is spent on utilizing computing resources by optimizing memory transfers, compute graphs, and data representation. In cases where computing resources are scarce or classification accuracy is not the sole target, we advocate for heuristics, such as the \gls{GSU} method, which balances computational cost and performance. -% \todo{call them long-term options / expiring options?} +All of our classifiers require some labeled instances for training. If the true label cannot be inferred from trades or generating labeled data is wasteful, our approaches are not applicable. For cases where trades are partially labeled, our pre-trained FT-Transformer offers a viable alternative to rule-based classification. Next, we analyze the robustness of our classifiers. -To assess the robustness of our algorithms, we partition the test sets into sub-samples along seven dimensions: option type, security type, trade size, year, time to maturity, moneyness, as well as proximity to quotes.
Comparable robustness checks have been previously conducted in \textcite[][47]{grauerOptionTradeClassification2022} as well as \textcite[][890--892]{savickasInferringDirectionOption2003}, strengthening comparability across different works.\footnote{Despite all efforts, when comparing with \textcite[][47--52]{grauerOptionTradeClassification2022}, one has to be aware that evaluation periods and fallback strategies differ. Furthermore, the authors group similar algorithms. Thus, we recommend relying on our estimates of their rules.} +\subsection{Robustness}\label{sec:robustness} -Our results are tabulated \cref{tab:diff-ise-gbm,tab:diff-cboe-gbm,tab:diff-ise-transformer,tab:diff-cboe-transformer,tab:diff-ise-gbm-semi,tab:diff-cboe-gbm-semi}, separately for \glspl{GBRT} and Transformers as well as exchanges. +\subsubsection{Robustness Results}\label{sec:robustness-results} -\clearpage +To assess the robustness of our algorithms, we partition the test sets into sub-samples along seven dimensions: option type, security type, trade size, year, time to maturity, moneyness, and proximity to quotes. Comparable robustness checks have been previously conducted in \textcite[\checkmark][47--50]{grauerOptionTradeClassification2022} as well as \textcite[\checkmark][890--892]{savickasInferringDirectionOption2003}, strengthening comparability across different works.\footnote{When comparing with \textcite[\checkmark][47--50]{grauerOptionTradeClassification2022}, one has to be aware that evaluation periods and fallback strategies differ. Furthermore, the authors group similar algorithms. Thus, we recommend relying on our estimates of their rules.} Our results are tabulated in \cref{tab:diff-ise-gbm,tab:diff-cboe-gbm,tab:diff-ise-transformer,tab:diff-cboe-transformer,tab:diff-ise-gbm-semi,tab:diff-cboe-gbm-semi}, separately for \glspl{GBRT} and Transformers as well as exchanges. \textbf{Gradient Boosting} -Performance improvements of \glspl{GBRT} are consistent for calls and puts across all feature sets and exchanges. Conditional on the security type of the underlying, \gls{GBRT} achieves the largest improvements for index options in the \gls{CBOE} sample, but perform slightly worse than rule-based approaches on the \gls{ISE} set. On both datasets, accuracies are lowest for index options, which corroborates with the literature on rule-based classification. +Performance improvements of \glspl{GBRT} are consistent for calls and puts across all feature sets and exchanges. Conditional on the security type of the underlying, \gls{GBRT} achieves the largest improvements for index options in the \gls{CBOE} sample, but performs worse than rule-based approaches on the \gls{ISE} set. Accuracies are lowest for index options, which corroborates the literature on rule-based classification. -The performance is stable for different trade sizes and over time. Similarly, accuracy improvements are comparable for different maturities and moneyness ratios.
Aligning with rule-based approaches, accuracies are lowest for option trades with long maturities and deep \gls{ITM} options, as reported in \textcite[\checkmark][22]{grauerOptionTradeClassification2022}. The addition of option-specific features has a smoothing effect on accuracies across moneyness ratios and maturities. +\glspl{GBRT} achieve particularly strong results for trades at the quotes or if quote data at the exchange level is absent. In these subsets, improvements reach up to \SI{16.01}{\percent} in the \gls{CBOE} sample and thus tighten the gap between trades inside and at the quotes. Consistent across all feature sets and exchanges, \glspl{GBRT} fail to improve upon classical rules for trades outside the spread, underperforming the benchmark by \SI{-0.89}{\percent} to \SI{-5.43}{\percent}. We identify the strong performance of quote-based classification on these trades as one reason why improvements are hard to attain. \begin{table}[ht!] \centering - \caption[Robustness of Gradient-Boosting on \glsentryshort{ISE}]{This table presents accuracies of \glspl{GBRT} across various sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of Gradient-Boosting on \glsentryshort{ISE}]{Accuracies of \glspl{GBRT} across all sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-ise-gbm} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 62.890486 & 3.720000 & 71.884647 & 4.480000 & 73.647971 & 6.240000 \\ @@ -310,11 +324,11 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \begin{table}[h!]
\centering - \caption[Robustness of Gradient-Boosting on \glsentryshort{CBOE}]{This table presents accuracies of \glspl{GBRT} across various sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of Gradient-Boosting on \glsentryshort{CBOE}]{Accuracies of \glspl{GBRT} across all sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-cboe-gbm} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 65.505083 & 5.370000 & 71.707057 & 5.770000 & 74.283388 & 8.350000 \\ @@ -368,21 +382,21 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \textbf{FT-Transformer} -Performance results of Transformers are robust across all tested dimensions. The accuracy is approximately equal for calls and puts. We observe, that the benchmark performance of puts is consistently higher in our sub-samples, which contrasts the finding of \textcite[][22]{grauerOptionTradeClassification2022}. +Performance results of Transformers are robust across all tested dimensions. The accuracy is approximately equal for calls and puts. We observe, that the benchmark performance of puts is consistently higher in our sub-samples, which contrasts the finding of \textcite[\checkmark][22]{grauerOptionTradeClassification2022}. -Similar to \glspl{GBRT}, the FT-Transformer slightly underperforms the benchmark for index options in the \gls{ISE} sample. Even though the effect reverses on the \gls{CBOE} set, accuracies for index options are lower than those of any other underlying. Hence, we can extend the finding of \textcites[][22]{grauerOptionTradeClassification2022}[][9]{savickasInferringDirectionOption2003} that index options are notoriously difficult to classify to machine learning-based approaches. +Similar to \glspl{GBRT}, the FT-Transformer slightly underperforms the benchmark for index options in the \gls{ISE} sample. Even though the effect reverses on the \gls{CBOE} set, accuracies for index options are lower than those of any other underlying. 
Hence, we can extend to machine learning-based approaches the finding of \textcites[\checkmark][22]{grauerOptionTradeClassification2022}[\checkmark][886]{savickasInferringDirectionOption2003} that index options are notoriously difficult to classify. -Classification is more accurate for near-expiring or deep \gls{ITM} options. In this sense, our finding contradicts the observation of \textcite[][891]{savickasInferringDirectionOption2003} made for rule-based classification. Again, we document that the addition of option-specific features, such as maturity or moneyness smooths out differences across maturity and moneyness levels. We defer discussing this aspect to \cref{sec:feature-importance}. +Classification is more accurate for near-expiring or deep \gls{ITM} options. In this sense, our finding contradicts the observation of \textcite[\checkmark][891]{savickasInferringDirectionOption2003} made for rule-based classification. Again, we document that the addition of option-specific features, such as maturity or moneyness, smooths out differences across maturity and moneyness levels. Lastly, the FT-Transformer performs well on trades at the quotes. When trained on \gls{ISE} data, it greatly outperforms the benchmark for these trades, even though some of the benchmarks contain explicit overrides from the trade size rule. Conversely, the FT-Transformer fails to meet benchmark performance for trades outside the spread. \begin{table}[h!] \centering - \caption[Robustness of FT-Transformer on \glsentryshort{ISE}]{This table presents accuracies of the FT-Transformer across various sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of FT-Transformer on \glsentryshort{ISE}]{Accuracies of the FT-Transformer across all sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-ise-transformer} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 62.991514 & 3.820000 & 72.099064 & 4.690000 & 73.484904 & 6.080000 \\ @@ -434,11 +448,11 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \begin{table}[h!]
\centering - \caption[Robustness of FT-Transformer on \glsentryshort{CBOE}]{This table presents accuracies of the FT-Transformer across various sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of FT-Transformer on \glsentryshort{CBOE}]{Accuracies of the FT-Transformer across all sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-cboe-transformer} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 65.628907 & 5.490000 & 71.945453 & 6.010000 & 74.579113 & 8.640000 \\ @@ -492,17 +506,17 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \textbf{Gradient-Boosting With Self-Training} -We analyze the robustness of \gls{GBRT} with self-training on \gls{CBOE} data in \cref{tab:diff-ise-gbm-semi} and \gls{CBOE} data in \cref{tab:diff-cboe-gbm-semi}. Similar to what we observe for the standard \glspl{GBRT}, \glspl{GBRT} with self-training outperforms the respective benchmarks on almost all subsets. The only exceptions are index options and options traded outside the quotes, where the model performs worse than \gls{GSU} method (small/large). +We analyze the robustness of \gls{GBRT} with self-training on \gls{ISE} data in \cref{tab:diff-ise-gbm-semi} and \gls{CBOE} data in \cref{tab:diff-cboe-gbm-semi}. Similar to what we observe for the standard \glspl{GBRT}, \glspl{GBRT} with self-training outperform the respective benchmarks on almost all subsets. The only exceptions are index options and options traded outside the quotes, where the model performs worse than the \gls{GSU} method (small)/(large). -Compared to the standard \glspl{GBRT}, performance degrades across almost all subsets. Quantitatively, we find no improvements in robustness as performance differences between sub-samples are of the same magnitude and the performance gap between rule-based classification extends for index options and trades outside the spread. +Compared to the standard \glspl{GBRT}, performance degrades across almost all subsets.
Quantitatively, no improvements in robustness are observable, as performance differences between sub-samples are of the same magnitude and the performance gap to rule-based classification widens for index options and trades outside the spread. \begin{table} \centering - \caption[Robustness of Gradient-Boosting With Self-Training on \glsentryshort{ISE}]{This table presents accuracies of the \gls{GBRT} with self-training across various sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of Gradient-Boosting With Self-Training on \glsentryshort{ISE}]{Accuracies of the \gls{GBRT} with self-training across all sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-ise-gbm-semi} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 62.652675 & 3.480000 & 71.692310 & 4.280000 & 73.080774 & 5.670000 \\ @@ -554,11 +568,11 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \begin{table}[!ht] \centering - \caption[Robustness of Gradient-Boosting With Self-Training on \glsentryshort{CBOE}]{This table presents accuracies of the \gls{GBRT} with self-training across various sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of Gradient-Boosting With Self-Training on \glsentryshort{CBOE}]{Accuracies of the \gls{GBRT} with self-training across all sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}.
The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-cboe-gbm-semi} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 65.684960 & 5.550000 & 71.679647 & 5.740000 & 73.861831 & 7.930000 \\ @@ -612,21 +626,17 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \textbf{Transformer With Self-Training} -Transformers with pre-training objectives outperform the benchmark in all subsets apart from index options and trades outside the quotes. For \gls{ISE} trades in \cref{tab:diff-ise-transformer-semi} pre-training improves performance across subsets, reaching accuracies greater than \SI{86}{\percent}. The only exception is index options, where the performance gap slightly widens. Deep-\gls{ITM} options and options with long maturity profit the most from the introduction of option features. - -For trades at the \gls{CBOE} performance improvements associated with pre-training are slightly lower across several sub-groups. Positively, pre-training improves robustness, as the performance gap to the benchmarks narrows for trades outside the quotes. The results in conjunction with the identical model architecture suggest, that pre-training on unlabeled trades encodes exchange-specific knowledge, which improves performance and robustness on \gls{ISE} trades, but does not universally profit \gls{CBOE} trades. - -So far it remains open, as to why most classifiers struggle to correctly classify index options and options traded outside the quotes. Index options are notoriously difficult to classify by standard algorithms, as unanimously documented in \textcites[][898-898]{savickasInferringDirectionOption2003}[][20]{grauerOptionTradeClassification2022}. \textcite[][898-898]{savickasInferringDirectionOption2003} trace back the low accuracy to the intensified use of complex trades in index option trading, such as bull spreads, which typically involve simultaneous buys and sells of options. Conceptually, it remains unclear if the components should be classified separately or as single complex trade. The explanation sheds light on why classification is difficult as a whole, but it does not address why accuracies trail the benchmark. \todo{We cannot test as we do not have simultaneous buy and sell orders?... ok different option series, but no other trade} Some insights can be gained from the data distribution: index trades make up only \SI{1.0731}{\percent} of all trades on the \gls{ISE} training set, resulting in a highly imbalanced distribution of the security type. Consequently, the model has fewer index option samples to train on and is susceptible to overfitting if it learns distinguishable patterns for security types. A sample weighting scheme could place more emphasis on index options. +Transformers with pre-training objectives outperform the benchmark in all subsets apart from index options and trades outside the quotes. 
For \gls{ISE} trades in \cref{tab:diff-ise-transformer-semi} pre-training improves performance across subsets, reaching accuracies greater than \SI{86}{\percent}. The only exception is index options, where the performance gap slightly widens. Deep \gls{ITM} options and options with long maturities profit the most from the introduction of option features. -In our test sets options traded outside the quotes can be reliably classified with the quote rule, which aligns with the intuition that customers are willing to trade at an additional liquidity premium, hence outside the quotes. We suspect the reason, why our methods fail to learn in this simple pattern in the infrequent distribution in \SI{0.7535}{\percent} of the \gls{ISE} dataset. Following our reasoning from above, the model can overfit the training samples more easily, eventually resulting in poor out-of-sample performance. As both subsets account for only a fraction of the entire set and differences in performance are only minor. We conclude that our approaches are stable across multiple dimensions and between exchanges. +For trades at the \gls{CBOE} performance improvements associated with pre-training are slightly lower across most sub-groups. Pre-training improves robustness, as the performance gap to the benchmarks narrows for trades outside the quotes. \begin{table}[!ht] \centering - \caption[Robustness of FT-Transformer With Pre-Training on \glsentryshort{ISE}]{This table presents accuracies of the FT-Transformer with pre-training across various sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of FT-Transformer With Pre-Training on \glsentryshort{ISE}]{Accuracies of the FT-Transformer with pre-training across all sub-samples of the \gls{ISE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-ise-transformer-semi} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. 
in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 63.749963 & 4.580000 & 72.330180 & 4.920000 & 74.065147 & 6.660000 \\ @@ -678,11 +688,11 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \begin{table}[!ht] \centering - \caption[Robustness of FT-Transformer With Pre-Training on \glsentryshort{CBOE}]{This table presents accuracies of the FT-Transformer with pre-training across various sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over $\operatorname{gsu}_{\mathrm{small}}$ for the feature set classical and $\operatorname{gsu}_{\mathrm{large}}$ for all other feature sets are given in +/- column.} + \caption[Robustness of FT-Transformer With Pre-Training on \glsentryshort{CBOE}]{Accuracies of the FT-Transformer with pre-training across all sub-samples of the \gls{CBOE} test set over time and by proximity to quotes, as well as option characteristics such as option and security type, time to maturity in days, and moneyness. The security type category "Others" encompasses options written on \glspl{ETF}, mutual funds, and \glspl{ADR}. The absolute improvements over \gls{GSU} (small) for the feature set classic and \gls{GSU} (large) for all other feature sets are given in the +/- column.} \label{tab:diff-cboe-transformer-semi} \begin{tabular}{lSSSSSS@{}} \toprule - {} & \multicolumn{2}{c}{\glsentryshort{FS} Classical} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} + {} & \multicolumn{2}{c}{\glsentryshort{FS} Classic} & \multicolumn{2}{c}{\glsentryshort{FS} Size} & \multicolumn{2}{c}{\glsentryshort{FS} Option} \\ \cmidrule(lr){2-3}\cmidrule(lr){4-5}\cmidrule(lr){6-7} {} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} & {Acc. in \%} & {+/-} \\\midrule \multicolumn{7}{l}{ Option Type} \\ \tabindent Call & 65.012528 & 4.880000 & 71.552840 & 5.620000 & 74.134277 & 8.200000 \\ @@ -734,26 +744,38 @@ \subsection{Robustness of Results}\label{sec:robustness-checks} \clearpage +\subsubsection{Robustness Discussion}\label{sec:robustness-discussion} + +All our classifiers achieve performance improvements that are robust across most subsets. Nevertheless, most classifiers struggle to correctly classify index options and options traded outside the quotes. We discuss both aspects. + +Index options are notoriously difficult to classify by standard algorithms, as unanimously documented in \textcites[\checkmark][898--899]{savickasInferringDirectionOption2003}[\checkmark][22]{grauerOptionTradeClassification2022}. \textcite[\checkmark][898--898]{savickasInferringDirectionOption2003} trace back the low accuracy to the intensified use of complex trades in index options trading, such as bull spreads, which typically involve simultaneous buys and sells of options. Conceptually, it remains unclear if the components should be classified separately or as single complex trade. We cannot verify this hypothesis as our labeling procedure does not guarantee that all components are contained in the final sample. Also, it only explains why classification is difficult as a whole, but it does not address why accuracies trail the benchmark. 
Some insights can be gained from the data distribution: index trades make up only \SI{1.0731}{\percent} of all trades on the \gls{ISE} training set, resulting in a highly imbalanced distribution of the security type. Consequently, the model has fewer index option samples to train on and is susceptible to overfitting if it learns distinguishable patterns for security types. A sample weighting scheme could place more emphasis on index options. + +In our test sets, options traded outside the quotes can be reliably classified with the quote rule, which aligns with the intuition that customers are willing to trade at an additional liquidity premium, hence outside the quotes. We suspect the reason why our methods fail to learn this simple pattern lies in the scarcity of such trades, which make up only \SI{0.7535}{\percent} of the \gls{ISE} dataset. Following our reasoning from above, the model can overfit the training samples more easily, eventually resulting in poor out-of-sample performance. + +As both subsets account for only a fraction of the entire set and differences in performance are only minor, we conclude that our approaches are stable across multiple dimensions and between exchanges. + \subsection{Feature Importance}\label{sec:feature-importance} -Transformers outperform all rule-based approaches by a large margin on the \gls{ISE} dataset. To gain insights into the factors driving this performance, we conduct a qualitative analysis of the attention mechanism and learned embeddings. For an evaluation of feature importances, that suffices for a cross-model comparison, we utilize \gls{SAGE}, building upon our rationale from \cref{sec:feature-importance-measure}. +Transformers outperform all rule-based approaches by a large margin on the \gls{ISE} dataset. To gain insights into the network, we conduct a qualitative analysis of the attention mechanism and learned embeddings. For an evaluation of feature importances that suffices for a cross-model comparison, we use \gls{SAGE}, building upon our rationale from \cref{sec:feature-importance-measure}. + +\subsubsection{Feature Importance Results}\label{sec:feature-importance-results} \textbf{Attention Visualization} -The analysis of attention follows a top-down approach. Initially, we generate model-wide attention maps using the methodology of \textcite[][2--4]{cheferTransformerInterpretabilityAttention2021}. Subsequently, we detail the analysis by probing specific attention heads adapting a procedure of \textcite[][4]{clarkWhatDoesBERT2019}. +The analysis of attention follows a top-down approach. Initially, we generate model-wide attention maps using the methodology of \textcite[\checkmark][784--786]{cheferTransformerInterpretabilityAttention2021}, then detail the analysis by probing specific attention heads, adapting a procedure of \textcite[\checkmark][279]{clarkWhatDoesBERT2019}. -Attention maps offer transparency at the trade or dataset level. To aid visualization, we focus on subsets of trades, where the performance of Transformers is particularly strong and select \num{16} trades at the quotes and \num{16} midspread trades from the \gls{ISE} test set. The resulting attention maps are shown in \cref{fig:attention-maps-ise}. +Attention maps offer transparency at the dataset or trade level. To aid visualization, we focus on subsets of trades where the performance of Transformers is particularly strong and select \num{16} trades at the quotes and \num{16} midspread trades from the \gls{ISE} test set.
The resulting attention maps are shown in \cref{fig:attention-maps-ise}. \begin{figure}[h!] \centering \includegraphics[width=1\textwidth]{attention_maps_ise_quotes_mid.pdf} - \caption[Attention Maps of FT-Transformer]{Attention maps of FT-Transformer trained on \gls{ISE} data with \gls{FS} option. The left plot contains attention weights of \num{16} trades at the quotes and the right plot of \num{16} midspread trades. Each column represents a trade and each row represents a feature. The intensity of the pixel represents the importance. $\mathtt{[CLS]}$ token excluded, as suggested in \textcite[][4]{cheferGenericAttentionmodelExplainability2021}. The green area marks a trade, that was correctly classified by the network. Details on the trade are given below.} + \caption[Attention Maps of FT-Transformer]{Attention maps of FT-Transformer trained on \gls{ISE} data with \gls{FS} option. The left plot contains attention weights of \num{16} trades at the quotes and the right plot of \num{16} midspread trades. Each column represents a trade and each row represents a feature. The intensity of the pixel represents the importance. $\mathtt{[CLS]}$ token excluded, as suggested in \textcite[\checkmark][400]{cheferGenericAttentionmodelExplainability2021}. The green area marks a trade, that was correctly classified by the network. Details on the trade are provided below.} \label{fig:attention-maps-ise} \end{figure} -Visually, the trade price and quotes at the exchange or inter-exchange level are important and frequently used. This aligns with theory, as these features are core to the quote rule and numerous hybrid algorithms. Also, quote-based algorithms are among the best performing in our dataset. Aside from the trade price, features required to estimate the tick rule attain only spurious attributions. Considering the devastating performance of tick-based algorithms in option trade classification, this is unsurprising. Features from the depth and trade size rule, such as the trade size, are used selectively for trades at the quotes. In this subset, option-specific features like the issue type, moneyness, time to maturity, or daily trading volume of the option series receive relatively high attention scores. Overall, engineered features, like the proximity to quotes, attain low attention scores, which suggests that the Transformer itself can synthesize the feature from the \emph{raw} bid, ask, and trade price. +Visually, the trade price and quotes at the exchange or inter-exchange level are important and frequently used. Also, quote-based algorithms are among the best performing in our dataset. Aside from the trade price, features required to estimate the tick rule attain only spurious attributions. Features from the depth and trade size rule, such as the trade size, are used selectively for trades at the quotes. In this subset, option-specific features like the issue type, moneyness, time to maturity, or daily trading volume of the option series receive relatively high attention scores. Overall, engineered features, like the proximity to quotes, attain low attention scores, which suggests that the Transformer itself can synthesize the feature from the \emph{raw} bid, ask, and trade price. -The model assigns higher attention scores to features present in rule-based algorithms. 
Due to the possible link to rule-based trade classification, it is worthwhile to explore, if the fine-grained patterns learned by specific attention heads translate to existing trade classification rules i.e., if specific tokens attend to features that are jointly used in rule-based classification. This information is sacrificed when aggregating over multiple attention heads and layers, as done for \cref{fig:attention-maps-ise}, but readily available from individual attention heads. To analyze this further, we adapt the approach of \textcite[][4]{clarkWhatDoesBERT2019} to our context and probe individual attention heads. +The model assigns high attention scores to features from rule-based algorithms. Due to a possible link to rule-based classification, we explore whether the fine-grained patterns learned by individual attention heads translate to existing trade classification rules, i.e., whether specific tokens attend to features that are jointly used in rule-based classification. This information is sacrificed when aggregating over multiple attention heads and layers, as done for \cref{fig:attention-maps-ise}. \begin{figure}[h!] \subfloat[Tick Rule-like Head (3,5)\label{fig:head-tick}]{\includegraphics[width=0.3\textwidth]{attention_head_5_layer_3_color_green_ise_quotes_mid.pdf}} @@ -761,43 +783,39 @@ \subsection{Feature Importance}\label{sec:feature-importance} \subfloat[Trade Size Rule-like Head (3,8)\label{fig:head-tsize}]{\includegraphics[width=0.3\textwidth]{attention_head_8_layer_3_color_green_ise_quotes_mid.pdf}} \hfill \subfloat[\glsentryshort{LR}-Like Head (4,8)\label{fig:head-lr}]{\includegraphics[width=0.3\textwidth]{attention_head_8_layer_4_color_green_ise_quotes_mid.pdf}} - \caption[Rule-like Roles of Selected Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). The intensity of the line represents the strength of attention weight. Attentions are only visualized for the $\mathtt{[CLS]}$ token. The model is trained on \gls{ISE} data. Visualizations based on code by \textcite[][4]{clarkWhatDoesBERT2019}.} + \caption[Rule-like Roles of Selected Attention Heads]{Attention heads that correspond to trade classification rules. Tuple denotes the location of the attention head in the model in the form of (layer, head). The intensity of the line represents the strength of attention weight. Attentions are only visualized for the $\mathtt{[CLS]}$ token. The model is trained on \gls{ISE} data. Visualizations based on code by \textcite[\checkmark][279]{clarkWhatDoesBERT2019}.} \label{fig:rule-like-attention-heads} \end{figure} We study attention weights of one specific trade in detail, which is marked in green in \cref{fig:attention-maps-ise}. The trade has the following properties: trade price \SI{3.5}[\$]{}, trade size \SI{5}[]{} contracts, ask at exchange \SI{3.85}[\$]{}, bid at exchange \SI{3.5}[\$]{}, ask size \SI{11}[]{} contracts, and bid size \SI{10}[]{} contracts, classified as a sell. \cref{fig:rule-like-attention-heads} depicts the result for selected attention heads involved in classifying the specific trade. The remaining attention heads are visualized in \cref{app:attention-heads-of-transformer}. Each subplot depicts the features to which the classification token $\mathtt{[CLS]}$ attends to. The attention weight determines the intensity of the line between the two.
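To make the head-probing step concrete, the following minimal Python/PyTorch sketch shows how the attention that the $\mathtt{[CLS]}$ token pays to each feature token could be read off for a single (layer, head) pair; the tensor layout, the position of $\mathtt{[CLS]}$ at token index 0, and all names are illustrative assumptions, not the implementation used in the thesis.

    import torch

    def cls_attention(attention_maps: list[torch.Tensor], layer: int, head: int) -> torch.Tensor:
        """Attention paid by the [CLS] token to every feature token for one (layer, head) pair.

        attention_maps: per-layer tensors of shape [batch, n_heads, n_tokens, n_tokens],
        e.g. collected with forward hooks on the self-attention modules (assumed layout).
        """
        attn = attention_maps[layer]      # [batch, n_heads, n_tokens, n_tokens]
        cls_row = attn[:, head, 0, :]     # row of the [CLS] query, assumed at token index 0
        return cls_row.mean(dim=0)        # average over the trades in the batch

    # usage sketch: weights = cls_attention(maps, layer=3, head=5)
    # ranked = torch.argsort(weights, descending=True)  # feature tokens this head attends to most

Ranking such per-head weight vectors over layers and heads is one way to surface rule-like patterns of the kind shown in \cref{fig:rule-like-attention-heads}.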
-Referring to the results from the appendix, we note that attention heads learn diverse patterns, as most heads attend to different tokens at once learning different relations. However, certain heads exhibit redundancy. For earlier layers in the network, the classification tokens gather from multiple tokens with uniform attention weights, whereas for the final self-attention layers, attention heads specialize in relations that seem related to rule-based trade classification. \cref{fig:head-tick} depicts a classification head that focuses solely on the change in trade price akin to the tick rule. In \cref{fig:head-tsize} the classification token in the neighboring head gathers simultaneously from multiple size-related features similar to the trade size rule. Finally, \cref{fig:head-lr} is alike to the \gls{LR} algorithm with additional dependencies on the moneyness. For other attention heads the purpose they serve in the network remains open. - -The redundancy between attention heads is possibly explained by the use of attention dropout in our networks (cp. \cref{sec:hyperparameter-tuning}), which randomly deactivates units of the network during training and forces the network to learn redundant representations. A similar point is made by \textcite[][8--9]{clarkWhatDoesBERT2019} for the related \gls{BERT} model. Our finding of uniform attention weights in earlier layers of the network is consistent with the of \textcite[][4]{abnarQuantifyingAttentionFlow2020} made for \gls{BERT}. +Referring to the results from the Appendix, we note that attention heads learn diverse patterns, as most heads attend to different tokens at once, learning different relations. However, certain heads exhibit redundancy. For earlier layers in the network, the classification token gathers from multiple tokens with uniform attention weights, whereas for the final self-attention layers, attention heads specialize in relations that seem related to rule-based trade classification. \cref{fig:head-tick} depicts a classification head that focuses solely on the change in trade price akin to the tick rule. In \cref{fig:head-tsize} the classification token in the neighboring head gathers simultaneously from multiple size-related features similar to the trade size rule. Finally, \cref{fig:head-lr} resembles the \gls{LR} algorithm with additional dependencies on the moneyness. For the remaining attention heads, the purpose they serve in the network cannot be mapped to a known rule. -When repeated for other trades, the identified roles of the attention heads are partially retained, but it is important to highlight that a more comprehensive analysis is required. We suggest revisiting this topic in future research as it potentially enables uncovering new rule-based approaches and understanding Transformer-based trade classification in more detail. +When repeated for other trades, the identified roles of the attention heads are partially retained. We intend to investigate this result more comprehensively in future work. \textbf{Embedding Visualization} -For the Transformer we know from \cref{sec:token-embeddings}, that embeddings can capture similarities by arranging related objects closer in embedding space. Visualizing the learned embeddings enables insights into the model. - -The embeddings are queried from the feature tokenizer in FT-Transformer. The similarity between embeddings is measured by cosine distance in embedding space.
The high-dimensional embeddings are then projected into 2D space using $t$-SNE \autocite[][2587]{vandermaatenVisualizingDataUsing2008}. As straightforward to interpret, we focus our analysis on the root, but note, that it applies to any numerical and categorical embeddings. +For Transformers we know from \cref{sec:token-embeddings}, that embeddings can capture similarities by arranging related objects closer in embedding space. Visualizing the learned embeddings enables insights into the learned patterns. -\cref{fig:categorical-embeddings} illustrates the embeddings exemplary for SPDR S\&P 500 Trust ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$) which can be \emph{qualitatively} interpreted.\footnote{As our analysis is condensed to two randomly chosen examples, we encourage the reader to use our interactive visualization for further exploration. Accessible here \url{https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/KarelZe/Embeddings/main/embedding_projector.config.json}.} +The embeddings are queried from the feature tokenizer in FT-Transformer. The similarity between embeddings is measured by cosine distance in embedding space. The high-dimensional embeddings are then projected into 2D space using $t$-SNE \autocite[\checkmark][2587]{vandermaatenVisualizingDataUsing2008}. As straightforward to interpret, we focus our analysis on the root, but note, that it applies to any numerical and categorical embeddings. \cref{fig:categorical-embeddings} illustrates the embeddings exemplary for SPDR S\&P 500 Trust ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$).\footnote{As our analysis is condensed to two randomly chosen examples, we encourage the reader to use our interactive visualization for further exploration. Accessible here \url{https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/KarelZe/Embeddings/main/embedding_projector.config.json}.} \begin{figure}[h!] \subfloat[Most Similar Embeddings to $\mathtt{SPY}$\label{fig:cat-embeddings-spy}]{\includegraphics[width=0.6\textwidth]{categorical_embeddings_SPY.pdf}} \vfill \subfloat[Most Similar Embeddings to $\mathtt{JPM}$\label{fig:cat-embeddings-jpm}]{\includegraphics[width=0.6\textwidth]{categorical_embeddings_JPM.pdf}} - \caption[Embeddings of Selected Underlyings]{Embeddings of selected underlyings. The plot depicts the projected embedding of SPDR S\&P 500 ETF ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$) and their most similar embeddings. Embeddings are projected into 2D-space using $t$-SNE. The ten most similar embeddings by cosine distance in the original space are colored and annotated. The model is trained on \gls{ISE} data.} + \caption[Embeddings of Selected Underlyings]{Embeddings of selected underlyings. The plot depicts the projected embedding of SPDR S\&P 500 ETF ($\mathtt{SPY}$) and JPMorgan Chase \& Co ($\mathtt{JPM}$) and their most similar embeddings. Embeddings are projected into 2D space using $t$-SNE. The ten most similar embeddings by cosine distance in the original space are colored and annotated. The model is trained on \gls{ISE} data.} \label{fig:categorical-embeddings} \end{figure} -For SPDR S\&P 500 ETF ($\mathtt{SPY}$) in \cref{fig:cat-embeddings-spy}, similar embeddings include iShares Russell 2000 ($\mathtt{IWM}$), iShares Russel 2000 ($\mathtt{WYV}$), SPDR S\&P 500 ETF ($\mathtt{OBV}$), and Direxion shares ETF ($\mathtt{XDT}$). This aligns with the intuition that \glspl{ETF} track identical or related indices. 
The model distinguishes \glspl{ETF} from other securities based on the feature issue type. Additional similar embeddings consist of Citigroup Inc. ($\mathtt{WRV}$), Kohl's Corp. ($\mathtt{OSS}$), Google Inc. ($\mathtt{YTM}$), and Intel Corp. ($\mathtt{INTC}$), which are long-term index constituents. +For SPDR S\&P 500 ETF ($\mathtt{SPY}$) in \cref{fig:cat-embeddings-spy}, similar embeddings include iShares Russell 2000 ($\mathtt{IWM}$), iShares Russell 2000 ($\mathtt{WYV}$), SPDR S\&P 500 ETF ($\mathtt{OBV}$), and Direxion shares ETF ($\mathtt{XDT}$), which track identical or related indices. Additional similar embeddings consist of Citigroup Inc. ($\mathtt{WRV}$), Kohl's Corp. ($\mathtt{OSS}$), Google Inc. ($\mathtt{YTM}$), and Intel Corp. ($\mathtt{INTC}$), which are index constituents. -Regarding JPMorgan Chase \& Co. ($\mathtt{JPM}$) in \cref{fig:cat-embeddings-jpm}, the most similar embedding is the of Bank of America ($\mathtt{BAC}$). Other similar embeddings include financial service providers like Amerigroup ($\mathtt{XGZ}$) and Janus Henderson Group ($\mathtt{ZPR}$). These results suggest that the model learned to group US financials, even without sector information provided. However, this argumentation does not apply to other related embeddings such as the Apollo Group ($\mathtt{OKO}$) or United Parcel Service of America ($\mathtt{YUP}$). % Autodesk Inc. ($\mathtt{ADQ}$) , Centex Corp. ($\mathtt{YYV}$), United Parcel Service of America ($\mathtt{YUP}$), Wild Oats Markets ($\mathtt{ZAC}$), SPDR S\&P 500 ETF ($\mathtt{SUE}$), and SPDR Dow Jones Industrial Average ($\mathtt{DIA}$). +Regarding JPMorgan Chase \& Co. ($\mathtt{JPM}$) in \cref{fig:cat-embeddings-jpm}, the most similar embedding is that of Bank of America ($\mathtt{BAC}$). Other similar embeddings include financial service providers like Amerigroup ($\mathtt{XGZ}$) and Janus Henderson Group ($\mathtt{ZPR}$). Here, the model learns to group US financials, even without sector information being provided. This argumentation does not apply to other related embeddings such as the Apollo Group ($\mathtt{OKO}$) or United Parcel Service of America ($\mathtt{YUP}$). -While these exemplary results indicate that the model can learn meaningful representations of the underlying, we must acknowledge its limitations. Both underlyings are frequently traded in our dataset, which may lead to meaningful embeddings. For infrequent underlyings, embeddings are likely to be close to their random initialization and lack meaningful patterns due to limited parameter updates and missing context. This issue is analogous to handling rare vocabulary items found in natural language processing. As the underlying plays a subordinate role in classification, this caveat is accepted. +Results suggest that the model can learn meaningful representations of the underlying. However, limitations exist. Both underlyings are frequently traded and prevalent in our dataset; for infrequent underlyings, embeddings are likely to be close to their random initialization and lack meaningful patterns due to limited parameter updates and missing context. This issue is analogous to handling rare vocabulary items found in language processing. \textbf{SAGE Values} -We compare the feature importances of rule-based and machine learning-based classifiers using \gls{SAGE}, which offers a clear interpretation of each feature's contribution to the prediction. The zero-one loss is chosen as a loss function, which is appealing due to the direct link to accuracy.
Based on the distribution of the \gls{ISE} test set, a na\"ive prediction of the majority class yields an accuracy of \SI{51.4027}{\percent} or a zero-one loss of $1- \num{0.514027} = \num{0.485973}$. \gls{SAGE} attributes the outperformance of machine learning or rule-based classifiers over the na\"ive prediction to the features based on Shapley values. The sum of all \gls{SAGE} values for a given predictor represents the difference in loss compared to the na\"ive classification. +We compare the feature importances of rule-based and machine learning-based classifiers using \gls{SAGE} with zero-one loss. Based on the distribution of the \gls{ISE} test set, a na\"ive prediction of the majority class yields an accuracy of \SI{51.4027}{\percent} or a zero-one loss of $1- \num{0.514027} = \num{0.485973}$. \gls{SAGE} attributes the outperformance of machine learning or rule-based classifiers over the na\"ive prediction to the features based on the Shapley values. For interpretation, the sum of all \gls{SAGE} values for a given predictor represents the difference in loss compared to the na\"ive classification. We group features to hide implementation details in classical rules. \begin{figure}[h!] \centering @@ -806,10 +824,23 @@ \subsection{Feature Importance}\label{sec:feature-importance} \label{fig:sage-importances} \end{figure} -From \cref{fig:sage-importances} that all models achieve the largest improvement in loss from quoted prices and if provided from the quoted sizes. The contribution of the \gls{NBBO} to performance is roughly equal for all models, suggesting that even simple heuristics effectively exploit the data. For machine learning-based predictors, quotes at the exchange level hold equal importance in classification. This contrast with \gls{GSU} methods, which rely less on exchange-level quotes and mostly classify trades based on upstream rules. The performance improvements from the trade size and quoted size, are slightly lower for rule-based methods compared to machine learning-based methods. Transformers and \glspl{GBRT} gain performance from the addition of option features, i.e., moneyness and time-to-maturity. In conjunction with the results from the robustness checks, this suggests that the improvements observed for long-running options or \gls{ITM} options are directly linked to the moneyness or time to maturity of the traded option itself. However, it remains unclear how these features interact with others. Regardless of the method used, changes in trade price before or after the trade are irrelevant for classification and can even harm performance. Similarly, additional features such as option type, issue type, the trading volume of the option series, and the underlying are also irrelevant. Thus, we note that there is a significant overlap between the importance of features in classical trade classification rules and machine learning-based predictors. +From \cref{fig:sage-importances} we see that all models achieve the largest improvement in loss from quoted prices and if provided from the quoted sizes. The contribution of the \gls{NBBO} to performance is roughly equal for all models, suggesting that even simple heuristics effectively exploit the data. For machine learning-based predictors, quotes at the exchange level hold equal importance in classification. This contrasts with \gls{GSU} methods, which rely less on exchange-level quotes and mostly classify trades based on top-level rules. 
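The arithmetic behind this comparison can be illustrated with a short Python sketch; the per-group values below are purely hypothetical and only demonstrate how \gls{SAGE} values under the zero-one loss add up to the improvement over the na\"ive majority-class prediction.

    # Zero-one loss of the naive majority-class prediction equals one minus its accuracy.
    p_majority = 0.514027            # majority share of the ISE test set (from the text)
    naive_loss = 1.0 - p_majority    # = 0.485973

    # Hypothetical, illustrative SAGE values per feature group; by construction they
    # sum to the loss improvement of a classifier over the naive baseline.
    sage_values = {
        "quoted prices (NBBO)": 0.12,
        "quoted prices (exchange)": 0.07,
        "trade and quoted sizes": 0.04,
        "option features": 0.01,
    }
    implied_loss = naive_loss - sum(sage_values.values())
    print(f"implied zero-one loss {implied_loss:.6f}, implied accuracy {1.0 - implied_loss:.4f}")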
+ +The performance improvements from the trade size and quoted size are slightly lower for rule-based methods compared to machine learning-based methods. Transformers and \glspl{GBRT} gain performance from the addition of option features, i.e., moneyness and time-to-maturity. + +Regardless of the method used, changes in trade price before or after the trade are irrelevant for classification and can even harm performance. Similarly, additional features such as option type, issue type, the trading volume of the option series, and the underlying are also irrelevant. Overall, we observe a large overlap between the importance of features in classical trade classification rules and machine learning-based predictors. -\todo{Importance of Moneyness and Time-to-Maturity} +\subsubsection{Feature Importance Discussion}\label{sec:feature-importance-discussion} + +The redundancy between attention heads is possibly due to the attention dropout in our networks (cp. \cref{sec:hyperparameter-tuning}), which randomly deactivates units of the network during training and forces the network to learn redundant representations. A similar point is made by \textcite[\checkmark][283--284]{clarkWhatDoesBERT2019} for the related \gls{BERT} model. Our finding of uniform attention weights in earlier layers of the network is consistent with the observation of \textcite[\checkmark][4193]{abnarQuantifyingAttentionFlow2020} made for \gls{BERT}. + +In conjunction with the results from the robustness checks, the importance of option features suggests that the improvements observed for long-running options or \gls{ITM} options are directly linked to the moneyness or time to maturity of the traded option itself. However, it remains unclear how these features interact with others. +\todo{Importance of Moneyness and Time-to-Maturity. How do these results fit into a broader picture?} \todo{Distribution in Sample: TTM, Trade Size, Moneyness} +\todo{Transformer-based models (Vaswani et al., 2017), analyses of attention weights have shown interpretable patterns in their structure (Coenen et al., 2019; Vig and Belinkov, 2019; Voita et al., 2019b; Hoover et al., 2019) and found strong correlations to syntax (Clark et al., 2019). However, other studies have also cast doubt on what conclusions can be drawn from attention patterns (Jain and Wallace, 2019; Serrano and Smith, 2019; Brunner et al., 2019). (found in merchant)} +Considering the devastating performance of tick-based algorithms in option trade classification, the irrelevance of price changes is unsurprising. + +The dominance of quoted prices aligns with theory, as these features are core to the quote rule and numerous hybrid algorithms. \clearpage @@ -817,29 +848,28 @@ \section{Application in Transaction Cost Estimation}\label{sec:application} \textbf{Preliminaries} -Albeit the classification accuracy is a reasonable measure for comparing classifiers, one cannot immediately infer how changes in accuracy e.~g., an improvement by \SI{1}{\percent}, affect the application domains. In an attempt to make our results tangible, we apply all algorithms to estimate trading cost, a problem we previously identified to be reliant on correct trade classification (cp. \cref{sec:introduction}) and a common testing ground for trade classification rules \autocites[cp.][541]{ellisAccuracyTradeClassification2000}[][569]{finucaneDirectTestMethods2000}[][271--278]{petersonEvaluationBiasesExecution2003}[][896--897]{savickasInferringDirectionOption2003}.
+Albeit the classification accuracy is a reasonable measure for comparing classifiers, one cannot immediately infer how changes in accuracy e.~g., an improvement by \SI{1}{\percent}, affect the application domains. In an attempt to make our results tangible, we apply all algorithms to estimate trading cost, a problem we previously identified to be reliant on correct trade classification (cp. \cref{sec:introduction}) and a common testing ground for trade classification rules \autocites[cp.][540--541]{ellisAccuracyTradeClassification2000}[\checkmark][569--570]{finucaneDirectTestMethods2000}[\checkmark][271--278]{petersonEvaluationBiasesExecution2003}[\checkmark][896--897]{savickasInferringDirectionOption2003}. -One of the most widely adopted measures for trading costs is the effective spread \autocite[][112]{Piwowar_2006}. It is defined as the difference between the trade price and the fundamental value of the asset \autocite[][238--239]{bessembinderIssuesAssessingTrade2003}. Following \textcite[][238--239]{bessembinderIssuesAssessingTrade2003}, we define the \emph{nominal, effective spread} as +One of the most widely adopted measures for trading costs is the effective spread \autocite[\checkmark][112]{Piwowar_2006}. It is defined as the difference between the trade price and the fundamental value of the asset \autocite[\checkmark][238--239]{bessembinderIssuesAssessingTrade2003}. Following \textcite[\checkmark][238--239]{bessembinderIssuesAssessingTrade2003}, we define the \emph{nominal, effective spread} as \begin{equation} S_{i,t} = 2 (P_{i,t} - V_{i,t}) D_{i,t}. \label{eq:effective-spread} \end{equation} -Like before, $i$ indexes the security and $t$ the point in time. Here, $D_{i,t}$ is the trade direction, which is either $1$ for customer buy orders and $-1$ for sell orders. If the trade initiator is known, we set $D_{i,t} = y_{i,t}$ and $D_{i,t}=\hat{y}_{it}$, if inferred from a rule or classifier. As the fundamental value $V_{i,t}$ is unobserved at the time of the trade, we follow a common track in research and use the midpoint of the prevailing quotes as an observable proxy.\footnote{An alternative treatment for options is discussed in \textcite[][4975--4976]{muravyevOptionsTradingCosts2020} Our focus is on the midspread, as it is the most common proxy for the value.} This is also a natural choice, under the assumption that, on average, the spread is symmetric and centered around the true fundamental value \autocite[][1018]{leeMarketIntegrationPrice1993}. We multiply the so-obtained half-spread by $2 \times$ to obtain the effective spread, which represents the cost for a round trip trade involving a buy and sell excluding commissions. +Like before, $i$ indexes the security and $t$ the point in time. Here, $D_{i,t}$ is the trade direction, which is either $1$ for customer buy orders and $-1$ for sell orders. If the trade initiator is known, we set $D_{i,t} = y_{i,t}$ and $D_{i,t}=\hat{y}_{it}$, if inferred from a rule or classifier. 
As the fundamental value $V_{i,t}$ is unobserved at the time of the trade, we follow a common track in research and use the midpoint of the prevailing quotes as an observable proxy.\footnote{An alternative treatment for options is discussed in \textcite[\checkmark][4975--4976]{muravyevOptionsTradingCosts2020}. Our focus is on the midspread, as it is the most common proxy for the value.} This is also a natural choice under the assumption that, on average, the spread is symmetric and centered around the true fundamental value \autocite[\checkmark][1018]{leeMarketIntegrationPrice1993}. We multiply the so-obtained half-spread by a factor of two to obtain the effective spread, which represents the cost of a round-trip trade involving a buy and a sell, excluding commissions. -Apparent from \cref{eq:effective-spread}, poor estimates for the predicted trade direction, lead to an under or overestimated effective spread, and hence to a skewed trade cost estimate. Only for trades at the midspread, the predicted trade direction is irrelevant, since the effective spread is zero. By comparing the true effective spread from the estimated, we can derive the economic significance. A classifier correctly classifying every trade, achieves an effective spread estimate equal to the true spread. For a random classifier, the effective spread is around zero, as misclassification estimates the spread with the opposite sign, which offsets with correct, random estimates for other trades. +As is apparent from \cref{eq:effective-spread}, poor estimates of the predicted trade direction lead to an under- or overestimated effective spread, and hence to a skewed trade cost estimate. Only for trades at the midspread is the predicted trade direction irrelevant, since the effective spread is zero. By comparing the true effective spread with the estimated one, we can gauge the economic significance. A classifier correctly classifying every trade achieves an effective spread estimate equal to the true spread. For a random classifier, the effective spread is around zero, as misclassification estimates the spread with the opposite sign, which offsets against other correct, random estimates. For convenience, we also calculate the \emph{relative effective spread} as \begin{equation} {PS}_{i,t} = S_{i,t} / V_{i,t}. \end{equation} -% \todo{check how it is defined Savickas / Finucane use midpoint, Peterson and Sirri divide by price / so does Chakrabarty 2007 p. 3819?} -Adapted from \textcite[][12]{theissenTestAccuracyLee2000} a Wilcoxon test is conducted to assess if the medians of the estimated, effective spread and the true effective spread are equal. +Adapted from \textcite[\checkmark][158]{theissenTestAccuracyLee2001}, a Wilcoxon test is conducted to assess if the medians of the estimated effective spread and the true effective spread are equal.
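A small, self-contained Python sketch of this estimation logic is given below; the synthetic data, the assumed 75-percent-accurate classifier, and the use of scipy.stats.wilcoxon are illustrative assumptions, not the thesis' actual implementation.

    import numpy as np
    from scipy.stats import wilcoxon

    rng = np.random.default_rng(42)
    n = 1_000
    bid = rng.uniform(3.0, 4.0, n)
    ask = bid + rng.uniform(0.05, 0.50, n)
    mid = 0.5 * (bid + ask)                                    # proxy for the fundamental value V
    d_true = rng.choice([-1, 1], n)                            # true trade initiator
    price = np.where(d_true == 1, ask, bid)                    # trades at the quotes, for simplicity
    d_pred = np.where(rng.random(n) < 0.75, d_true, -d_true)   # hypothetical 75 % accurate classifier

    s_true = 2.0 * (price - mid) * d_true                      # nominal effective spread
    s_pred = 2.0 * (price - mid) * d_pred                      # estimated effective spread
    ps_pred = s_pred / mid                                     # relative effective spread

    print(s_true.mean(), s_pred.mean())                        # misclassification biases the mean estimate downwards
    stat, p_value = wilcoxon(s_pred, s_true)                   # Wilcoxon test of equal medians (paired)

The toy example also illustrates the offsetting effect described above: every misclassified trade flips the sign of its spread contribution, pulling the mean estimate towards zero.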
\textbf{Results} -The true and the estimated effective spreads for the test sets are shown in the \cref{tab:effective-spread} aggregated by mean. \textcite[][896--897]{savickasInferringDirectionOption2003} estimated the effective spreads of rules on an older subset of option trades at the \gls{CBOE}, which can be compared against. +The true and the estimated effective spreads for the test sets are shown in \cref{tab:effective-spread}, aggregated by mean. \textcite[\checkmark][896--897]{savickasInferringDirectionOption2003} estimated the effective spreads of rules on an older subset of option trades at the \gls{CBOE}, which can be compared against. Our results match theirs in magnitude. \begin{table}[!ht] \centering @@ -848,14 +878,19 @@ \section{Application in Transaction Cost Estimation}\label{sec:application} \label{tab:effective-spread} \end{table} -In summary, quote-based algorithms like the quote rule and the \gls{LR} algorithm severely overestimate the effective spread. The overestimate is less severe for the \gls{CLNV} algorithm due to stronger dependency on the tick rule. The tick rule itself achieves estimates closest to the true effective spread, which is \num[round-mode=places, round-precision=3]{0.004926}[\$]{} and \num[round-mode=places, round-precision=3]{0.012219}[\$]{} for the \gls{ISE} and \gls{CBOE} sample respectively. As primarily tick-based algorithms, like the tick rule or \gls{EMO} rule, act as a random classifier in our samples, we conclude that the close estimate is an artifact of randomness, not due to superior predictive power. This observation is in line with \textcite[][897]{savickasInferringDirectionOption2003}, who make a similar argument for the \gls{EMO} rule on \gls{CBOE} trades. For rule-based algorithms $\operatorname{gsu}_{\mathrm{large}}$ provides reasonable estimates of the effective spread while achieving high classification accuracy. +In summary, quote-based algorithms like the quote rule and the \gls{LR} algorithm severely overestimate the effective spread. The overestimate is less severe for the \gls{CLNV} algorithm due to stronger dependency on the tick rule. The tick rule itself achieves estimates closest to the true effective spread, which is \num[round-mode=places, round-precision=3]{0.004926}[\$]{} and \num[round-mode=places, round-precision=3]{0.012219}[\$]{} for the \gls{ISE} and \gls{CBOE} sample respectively. As primarily tick-based algorithms, like the tick rule or \gls{EMO} rule, act as a random classifier in our samples, we conclude that the close estimate is an artifact of randomness, not due to superior predictive power. This observation is in line with \textcite[\checkmark][897]{savickasInferringDirectionOption2003}, who make a similar argument for the \gls{EMO} rule on \gls{CBOE} trades. For rule-based algorithms, \gls{GSU} (large) provides reasonable estimates of the effective spread while achieving high classification accuracy. Among our supervised classifiers, the FT-Transformer or \gls{GBRT} trained on \gls{FS} option provides estimates closest to the true effective spread, in particular on the \gls{CBOE} sample. For semi-supervised classifiers, Transformer-based models approximate the true effective spread best. This is best reflected in a predicted effective spread at the \gls{ISE} of \SI[round-mode=places, round-precision=3]{0.013118}[\$]{} versus \SI[round-mode=places, round-precision=3]{0.004926}[\$]{}. The null hypothesis of equal medians is rejected at the \SI{1}{\percent} level for all classifiers. Thus, the \gls{GSU} (large) method provides the best estimate of the effective spread if the true labels are absent. For labeled data, Transformer or gradient boosting-based approaches can provide more accurate estimates. The de facto standard, the \gls{LR} algorithm, fails to deliver accurate estimates and may bias research. +\todo{my results are smaller than in \textcites[][]{muravyevOptionsTradingCosts2020}{savickasInferringDirectionOption2003}{kaeckPriceImpactBid2022}. Might have to do with moneyness but verify root cause.} \todo{“In addition, my results offer little help in answering why option bid-ask spreads are so large.
This is one of the biggest puzzles in the options literature—existing theories of the option spread fail to explain its magnitude and shape (Muravyev and Pearson (2014)).”} -\todo{compare against \textcite[][4981]{muravyevOptionsTradingCosts2020} or \autocite{kaeckPriceImpactBid2022}} +\todo{compare against \textcite[\checkmark][4981]{muravyevOptionsTradingCosts2020} or \autocite{kaeckPriceImpactBid2022}} \todo{Think about reporting as a percentage. Adjust the formula from above.} \todo{Look into \textcite{muravyevOptionsTradingCosts2020}} \todo{Options listed on multiple exchanges have narrower spreads than those listed on a single exchange, but the difference diminishes as option volume increases. Option spreads become wider when a competing exchange delists the option.\autocite{mayhewCompetitionMarketStructure2002}} +\todo{“The nature of option markets is such that percentage and dollar spreads are not easily comparable between options with different degrees of moneyness. In particular, deep out-of-the-money options have a low value and therefore the percentage spread is typically large compared to its dollar spread. The opposite holds for deep in-the-money options. Further, we show both value-weighted and equal-weighted spreads, because the former overweights high-priced ITM options and the latter overweights low-priced OTM options.” (Kaeck et al)} +\todo{“he equal-weighted average effective quoted half spread, i.e., the quoted half-spread just before each trade, is 12.8 prozent, while the effective spread is only 5.6 pronzent. The large difference is explained by the many trades negotiated off-exchange or directly with dealers that occur inside the quoted bid and ask prices. The dollar value-weighted percentage spreads are about one-third of the equal-weighted values, reflecting that high-priced ITM options have relatively small percentage spreads.” (Kaek et al)} +\todo{Explain what Muravyev do differently. “They replace the midpoint by the predicted value of a regression of the midpoint on Black–Scholes price minus midpoint, delta times lagged underlying price differences, and lagged price changes. This adjustment offers a better expectation of the unobserved fundamental value than the quoted midpoint.” (Kaeck et al)} + diff --git a/reports/Content/rule-approaches.tex b/reports/Content/rule-approaches.tex index 74384942..b105f717 100644 --- a/reports/Content/rule-approaches.tex +++ b/reports/Content/rule-approaches.tex @@ -5,24 +5,22 @@ \section{Rule-Based Approaches}\label{sec:rule-based-approaches} \subsection{Trade Initiator} \label{sec:trade-initiator} -Various definitions for the trade initiator have been proposed in prior +Competing definitions for the trade initiator have been proposed in prior research. Among these, the: -\emph{Chronological view:} \textcite[][267]{odders-whiteOccurrenceConsequencesInaccurate2000} adapts a chronological view based on the order arrival. She defines the initiator of the trade as the party (buyer or seller) who places their order last, chronologically. This definition requires knowledge about the order submission times. +\emph{Chronological view:} \textcite[\checkmark][262]{odders-whiteOccurrenceConsequencesInaccurate2000} adapts a chronological view based on the order arrival. She defines the initiator of the trade as the party (buyer or seller) who places their order last, chronologically. This definition requires knowledge about the order submission times. 
-\emph{Immediacy view:} In contrast, \textcite[][94--97]{leeInferringInvestorBehavior2000} equate the trade initiator with the party in demand for immediate execution. Thus, traders placing market orders, immediately executable at whatever price, or executable limit orders, are considered the trade initiator. By contrast, the party placing non-executable limit orders, which may not even result in a trade, is the non-initiator. This definition remains ambiguous for trades resulting from crossed limit orders, matched market orders, or batched orders \autocite[][94--95]{leeInferringInvestorBehavior2000}. +\emph{Immediacy view:} In contrast, \textcite[\checkmark][94--97]{leeInferringInvestorBehavior2000} equate the trade initiator with the party in demand for immediate execution. Thus, traders placing market orders, immediately executable at whatever price, or executable limit orders, are considered the trade initiator. By contrast, the party placing non-executable limit orders, which may not even result in a trade, is the non-initiator. This definition remains ambiguous for trades resulting from crossed limit orders, matched market orders, or batched orders \autocite[\checkmark][94--95]{leeInferringInvestorBehavior2000}. % FIXME: introduce of the notion of demanding/taking away liquidity and providing liquidity -\emph{Positional view:} Independent from the order type and submission time, \textcite[][533]{ellisAccuracyTradeClassification2000} deduce their definition of the trade initiator based on the position of the involved parties opposite to the market maker or broker. The assumption is, that the market maker or broker only provides liquidity to the investor and the trade would not exist without the initial investor's demand. +\emph{Positional view:} Independent from the order type and submission time, \textcite[\checkmark][533]{ellisAccuracyTradeClassification2000} deduce their definition of the trade initiator based on the position of the involved parties opposite to the market maker or broker. The assumption is, that the market maker or broker only provides liquidity to the investor and the trade would not exist without the initial investor's demand. The appropriate view differs by data availability i.e., if the order type or submission type can be inferred from data and the application context. Regardless of the definition used, the trade initiator is binary and can either be the seller or the buyer. Henceforth, we denote it by $\gls{y} \in \mathcal{Y}$ with $\mathcal{Y}=\{-1,1\}$, with $y=-1$ indicating a seller-initiated and $y=1$ a buyer-initiated trade. The predicted trade initiator is distinguished by $\hat{y}$. -In anticipation of \cref{sec:data-preprocessing}, we adopt a customer's position-based view in relation to the market maker. Nevertheless, the concepts presented in this thesis can be applied to other perspectives as well. +In anticipation of \cref{sec:data-collection}, we adopt a customer's position-based view in relation to the market maker. Nevertheless, the concepts presented in this thesis can be applied to other perspectives as well. 
% \todo{new word: “Second, since net buying of puts by customers is tantamount to net put selling by market makers, and such order flow positively predicts market returns, option market makers may have information relevant for predicting market returns.” found in \textcite[][2]{chordiaIndexOptionTrading2021}} -\todo{new word: “Second, since net buying of puts by customers is tantamount to net put selling by market makers, and such order flow positively predicts market returns, option market makers may have information relevant for predicting market returns.” found in \textcite[][2]{chordiaIndexOptionTrading2021}} - -As the trade initiator is frequently absent in option datasets, it must be inferred using trade classification algorithms. The following section introduces basic rules for trade classification. We start with the ubiquitous quote and tick rule and continue with the more recent depth and trade size rule. Our focus is on classification rules, that sign trades on a trade-by-trade basis. Consequently, we omit classification rules for aggregated trades, like the \gls{BVC} algorithm of \textcite[][1466--1468]{easleyFlowToxicityLiquidity2012}. +As the trade initiator is frequently absent in option datasets, it must be inferred using trade classification algorithms. The following section introduces basic rules for trade classification. We start with the ubiquitous quote and tick rule and continue with the more recent depth and trade size rule. Our focus is on classification rules, that sign trades on a trade-by-trade basis. Consequently, we omit classification rules for aggregated trades, like the \gls{BVC} algorithm of \textcite[\checkmark][1466--1468]{easleyFlowToxicityLiquidity2012}. \subsection{Basic Rules}\label{sec:basic-rules} @@ -30,7 +28,7 @@ \subsection{Basic Rules}\label{sec:basic-rules} \subsubsection{Quote Rule}\label{sec:quote-rule} -The quote rule follows the rationale, that market makers provide quotes, against which buyers or sellers trade. It classifies a trade by comparing the trade price against the corresponding quotes at the time of the trade. We denote the sequence of trade prices of the $i$-th security by $(P_{i,t})_{t=1}^{T}$ and the corresponding ask at $t$ by $\gls{A}_{i,t}$ and bid by $\gls{B}_{i,t}$. If the trade price is above the midpoint of the bid-ask spread, estimated as $\gls{M}_{i,t} = \tfrac{1}{2}(B_{i,t} + A_{i,t})$, the trade is classified as a buy and if it is below the midpoint, as a sell \autocite[][41]{harrisDayEndTransactionPrice1989}.\footnote{For simplicity we assume an ideal data regime, where quote data is complete and spreads are positive.} Thus, the classification rule on $\mathcal{A} = \left\{(i, t) \in \mathbb{N}^2: P_{i,t} \neq M_{i,t}\right\}$ is given by: +The quote rule follows the rationale, that market makers provide quotes, against which buyers or sellers trade. It classifies a trade by comparing the trade price against the corresponding quotes at the time of the trade. We denote the sequence of trade prices of the $i$-th security by $(P_{i,t})_{t=1}^{T}$ and the corresponding ask at $t$ by $\gls{A}_{i,t}$ and bid by $\gls{B}_{i,t}$. 
If the trade price is above the midpoint of the bid-ask spread, estimated as $\gls{M}_{i,t} = \tfrac{1}{2}(B_{i,t} + A_{i,t})$, the trade is classified as a buy and if it is below the midpoint, as a sell \autocite[\checkmark][41]{harrisDayEndTransactionPrice1989}.\footnote{For simplicity we assume an ideal data regime, where quote data is complete and spreads are positive.} Thus, the classification rule on $\mathcal{A} = \left\{(i, t) \in \mathbb{N}^2: P_{i,t} \neq M_{i,t}\right\}$ is given by: \begin{equation} \operatorname{quote}\colon \mathcal{A} \to \mathcal{Y},\quad \operatorname{quote}(i, t)= @@ -43,11 +41,11 @@ \subsubsection{Quote Rule}\label{sec:quote-rule} As options are typically cross-listed on multiple exchanges, the quote rule can be estimated at the exchange level or on the \gls{NBBO}. -\todo{“The structure of the U.S. options market is similar to that of the equity market but has some distinct features. Options are typically cross-listed across multiple fully electronic exchanges, and the NBBO rule is enforced. Investors can post limit or market orders, and market-makers are obliged to provide continuous two-sided quotes.” Make clear, what is the difference and why it matters.} +% \todo{“The structure of the US options market is similar to that of the equity market but has some distinct features. Options are typically cross-listed across multiple fully electronic exchanges, and the NBBO rule is enforced. Investors can post limit or market orders, and market-makers are obliged to provide continuous two-sided quotes.” Make clear, what is the difference and why it matters.} \subsubsection{Tick Test}\label{sec:tick-test} -A common alternative to the quote rule is the tick test. Based on the rationale that buys increase trade prices and sells lower them, the tick test classifies trades by the change in trade price. It was first applied in \textcites[][244]{holthausenEffectLargeBlock1987}[][240]{hasbrouckTradesQuotesInventories1988}. The tick test is defined as: +A common alternative to the quote rule is the tick test. Based on the rationale that buys increase trade prices and sells lower them, the tick test classifies trades by the change in trade price. It was first applied in \textcites[\checkmark][244]{holthausenEffectLargeBlock1987}[\checkmark][240]{hasbrouckTradesQuotesInventories1988}. The tick test is defined as: \begin{equation} \operatorname{tick}\colon \mathbb{N}^2 \to \mathcal{Y},\quad \operatorname{tick}(i, t)= @@ -59,11 +57,11 @@ \subsubsection{Tick Test}\label{sec:tick-test} \end{cases} \label{eq:tick-test} \end{equation} -Considering the cases in \cref{eq:tick-test} the trade price is higher than the previous price (uptick) the trade is classified as a buy.\footnote{To end recursion at $t=1$, we sign the trades randomly as buyer- or seller-initiated to simplify notation. Other choices are possible. Similarly done for \cref{eq:reverse-tick-test}.} Reversely, if it is below the previous price (downtick), the trade is classified as a sell. If the price change is zero (zero tick), the signing uses the last price different from the current price \autocite[][735]{leeInferringTradeDirection1991}. +Considering the cases in \cref{eq:tick-test} the trade price is higher than the previous price (uptick) the trade is classified as a buy.\footnote{To end recursion at $t=1$, we sign the trades randomly as buyer- or seller-initiated to simplify notation. Other choices are possible. 
Similarly done for \cref{eq:reverse-tick-test}.} Reversely, if it is below the previous price (downtick), the trade is classified as a sell. If the price change is zero (zero tick), the signing uses the last price different from the current price \autocite[\checkmark][735]{leeInferringTradeDirection1991}. -By this means, the tick rule can sign all trades as long as a last differing trade price exists, but the overall precision can be impacted by infrequent trading. Being only dependent on transaction data makes the tick rule highly data-efficient. Waiving any quote data for classification contributes to this efficiency, but also poses a major limitation with regard to trades at the bid or ask, as discussed by \textcite[][557--558]{finucaneDirectTestMethods2000}. For instance, if quotes rise between trades, then a sale at the bid on an uptick or zero uptick is misclassified as a buy by the tick test due to the overall increased trade price. Similarly for falling quotes, buys at the ask on downticks or zero downticks are erroneously classified as a sell. +By this means, the tick rule can sign all trades as long as a last differing trade price exists, but the overall precision can be impacted by infrequent trading. Being only dependent on transaction data makes the tick rule highly data-efficient. Waiving any quote data for classification contributes to this efficiency, but also poses a major limitation with regard to trades at the bid or ask, as discussed by \textcite[\checkmark][557--558]{finucaneDirectTestMethods2000}. For instance, if quotes rise between trades, then a sale at the bid on an uptick or zero uptick is misclassified as a buy by the tick test due to the overall increased trade price. Similarly for falling quotes, buys at the ask on downticks or zero downticks are erroneously classified as a sell. -The reverse tick test is a variant of the tick test proposed in \textcite[][241]{hasbrouckTradesQuotesInventories1988}. It is similar to the tick rule but classifies based on the next, distinguishable trade price. +The reverse tick test is a variant of the tick test proposed in \textcite[\checkmark][241]{hasbrouckTradesQuotesInventories1988}. It is similar to the tick rule but classifies based on the next, distinguishable trade price. \begin{equation} \operatorname{rtick} \colon \mathbb{N}^2 \to \mathcal{Y},\quad @@ -76,18 +74,17 @@ \subsubsection{Tick Test}\label{sec:tick-test} \end{cases} \label{eq:reverse-tick-test} \end{equation} -As denoted in \cref{eq:reverse-tick-test}, the trade is classified as seller-initiated, if the next trade is on an uptick or a zero uptick, and classified as buyer-initiated for trades at a downtick or a zero downtick \autocite[][735--636]{leeInferringTradeDirection1991}. +As denoted in \cref{eq:reverse-tick-test}, the trade is classified as seller-initiated, if the next trade is on an uptick or a zero uptick, and classified as buyer-initiated for trades at a downtick or a zero downtick \autocite[\checkmark][735--736]{leeInferringTradeDirection1991}. -Both tests result in the same classification, if the current trade is bracketed by a price reversal and the price change after the trade is opposite from the change before the trade, but differ for price continuations when price changes are in the same direction \autocite[][736]{leeInferringTradeDirection1991}. 
+Both tests result in the same classification if the current trade is bracketed by a price reversal and the price change after the trade is opposite from the change before the trade, but differ for price continuations when price changes are in the same direction \autocite[\checkmark][736]{leeInferringTradeDirection1991}. -Tick tests can be estimated using the surrounding prices trading venue or at the inter-exchange level. The performance of the tick rules hinges on the availability of recent trade prices for inference. For infrequently traded assets this can pose a problem, as outdated prices might lose their relevancy in classification, as documented in \textcite[][568]{finucaneDirectTestMethods2000}. -% similar in chakrabarty paper, but there seems to be a typo. Her sentence just makes no sense. +Tick tests can be estimated using the surrounding prices at the trading venue or at the inter-exchange level. The performance of the tick rules hinges on the availability of recent trade prices for inference. For infrequently traded assets this can pose a problem, as outdated prices might lose their relevancy in classification, as documented in \textcite[\checkmark][568]{finucaneDirectTestMethods2000}. \subsubsection{Depth Rule}\label{sec:depth-rule} As \cref{sec:quote-rule} discusses, the quote rule necessitates alternative procedures for midspread trades. For midspread trades, \textcite[][14]{grauerOptionTradeClassification2022} propose the depth rule as a remedy. -The depth rule gauges the trade initiator from the quoted size at the best bid and ask. Based on the observation that an exceeding bid or ask size relates to higher liquidity at one trade side, trades are classified as a buy (sell) for a larger ask (bid) size \autocite[][14]{grauerOptionTradeClassification2022}. +The depth rule gauges the trade initiator from the quoted size at the best bid and ask. Based on the observation that an exceeding bid or ask size relates to higher liquidity at one trade side, trades are classified as a buy (sell) for a larger ask (bid) size \autocite[\checkmark][14--15]{grauerOptionTradeClassification2022}. Let $\gls{A-tilde}_{i,t}$ denote the quoted size of the ask, $\gls{B-tilde}_{i,t}$ of the bid, and $P_{i,t}$ the trade price at $t$ of the $i$-th option. We set the domain as $\mathcal{A} = \left\{(i, t) \in \mathbb{N}^2: P_{i,t} = \gls{M}_{i,t} \land \tilde{A}_{i,t} \neq \tilde{B}_{i,t} \right\}$. The depth rule is now calculated as: \begin{equation} @@ -103,7 +100,7 @@ \subsubsection{Depth Rule}\label{sec:depth-rule} \subsubsection{Trade Size Rule}\label{sec:trade-size-rule} -Generally, quote-based approaches are preferred due to their strong performance. \textcite[][13]{grauerOptionTradeClassification2022} stress, however, that the quote rule systematically misclassifies limit orders, and propose an override. The trade size rule is defined on $\mathcal{A} = \left\{(i, t) \in \mathbb{N}^2: \tilde{P}_{i,t} = \tilde{A}_{i,t} \neq \tilde{B}_{i,t} \lor \tilde{P}_{i,t} \neq\tilde{A}_{i,t} = \tilde{B}_{i,t} \right\}$ as: +Generally, quote-based approaches are preferred due to their strong performance. \textcite[\checkmark][13--14]{grauerOptionTradeClassification2022} stress, however, that the quote rule systematically misclassifies limit orders, and propose an override.
The trade size rule is defined on $\mathcal{A} = \left\{(i, t) \in \mathbb{N}^2: \tilde{P}_{i,t} = \tilde{A}_{i,t} \neq \tilde{B}_{i,t} \lor \tilde{P}_{i,t} \neq\tilde{A}_{i,t} = \tilde{B}_{i,t} \right\}$ as: \begin{equation} \operatorname{tsize} \colon \mathcal{A} \to \mathcal{Y},\quad \operatorname{tsize}(i, t)= @@ -113,10 +110,11 @@ \subsubsection{Trade Size Rule}\label{sec:trade-size-rule} \end{cases} \label{eq:trade-size-rule} \end{equation} -The trade size rule in \cref{eq:trade-size-rule} classifies based on a match between the size of the trade $\tilde{P}_{i, t}$ and the quoted bid and ask sizes. The rationale is, that the market maker tries to fill the limit order of a customer, which results in the trade being executed at the contemporaneous bid or ask, with a trade size equaling the quoted size \autocite[][13]{grauerOptionTradeClassification2022}. When both the size of the ask and bid correspond with the trade size or the trade size does not match the quoted sizes, the result is ambiguous. +The trade size rule in \cref{eq:trade-size-rule} classifies based on a match between the size of the trade $\tilde{P}_{i, t}$ and the quoted bid and ask sizes. The rationale is, that the market maker tries to fill the limit order of a customer, which results in the trade being executed at the contemporaneous bid or ask, with a trade size equaling the quoted size \autocite[\checkmark][13]{grauerOptionTradeClassification2022}. When both the size of the ask and bid correspond with the trade size or the trade size does not match the quoted sizes, the result is ambiguous. -Expectedly, the improvement is highest for trades at the quotes and reverses for trades outside the quote \autocite[][15]{grauerOptionTradeClassification2022}. Based on these results, the trade size rule may only be applied selectively to trades near or at the quote. Since only a fraction of all trades can be classified with the trade size rule, the rule must be combined with other basic or hybrid rules for complete coverage. The subsequent section introduces four hybrid algorithms, that combine basic rules into more sophisticated algorithms. +Expectedly, the improvement is highest for trades at the quotes and reverses for trades outside the quote \autocite[\checkmark][13]{grauerOptionTradeClassification2022}. Based on these results, the trade size rule may only be applied selectively to trades near or at the quote. Since only a fraction of all trades can be classified with the trade size rule, the rule must be combined with other basic or hybrid rules for complete coverage. The subsequent section introduces four hybrid algorithms, that combine basic rules into more sophisticated algorithms. +\addtocontents{toc}{\protect\newpage} \subsection{Hybrid Rules}\label{sec:hybrid-rules} The basic trade classification rules from \cref{sec:basic-rules} can be combined into a hybrid algorithm to enforce universal applicability to all trades and improve the classification performance. @@ -144,18 +142,18 @@ \subsection{Hybrid Rules}\label{sec:hybrid-rules} \input{./Graphs/grauer-algo.pdf_tex}} } \hfill\null - \caption[Overview Over Hybrid Trade Classification Rules]{Overview Over hybrid trade classification rules. The Figure visualizes the components of the \acrshort{LR} algorithm, \acrshort{EMO} rule, the \acrshort{CLNV} method, and an arbitrary, stacked combination relative to the quotes. Rules at the midpoint or the quotes are slightly exaggerated for better readability. 
Own work inspired by \textcite[][167]{poppeSensitivityVPINChoice2016}.} + \caption[Overview Over Hybrid Trade Classification Rules]{Overview over hybrid trade classification rules. The figure visualizes the components of the \acrshort{LR} algorithm, \acrshort{EMO} rule, the \acrshort{CLNV} method, and an arbitrary, stacked combination relative to the quotes. Rules at the midpoint or the quotes are slightly exaggerated for better readability. Visualization inspired by \textcite[\checkmark][167]{poppeSensitivityVPINChoice2016}.} \label{fig:hybrid-algorithms} \end{figure} Popular variants include the \gls{LR} algorithm, the \gls{EMO} rule, and the \gls{CLNV} method. All three algorithms utilize the quote and tick rule to a varying extent, as depicted in \cref{fig:hybrid-lr,fig:hybrid-emo,fig:hybrid-clnv}. Basic rules are selected based on the proximity of the trade price to the quotes. We study all algorithms in detail in \cref{sec:lee-and-ready-algorithm,sec:ellis-michaely-ohara-rule,sec:chakarabarty-li-nguyen-van-ness-method}. -As put forth by \textcite[][18]{grauerOptionTradeClassification2022}, basic or hybrid rules can be combined through stacking. One such combination is depicted in \cref{fig:hybrid-grauer}. This approach generalizes the aforementioned algorithms, as the applied rule is no longer dependent on the proximity to the quotes, but rather on the classifiability of the trade with the primary rules given by the domains and their ordering. We cover this approach last. +As put forth by \textcite[][15]{grauerOptionTradeClassification2022}, basic or hybrid rules can be combined through stacking. One such combination is depicted in \cref{fig:hybrid-grauer}. This approach generalizes the aforementioned algorithms, as the applied rule is no longer dependent on the proximity to the quotes, but rather on the classifiability of the trade with the primary rules given by the domains and their ordering. We cover this approach last. \subsubsection{Lee and Ready Algorithm}\label{sec:lee-and-ready-algorithm} -The popular \gls{LR} algorithm \autocite[][745]{leeInferringTradeDirection1991} combines the (reverse) tick test and quote rule into a single rule, which is derived from two observations. First, \textcite[][735--743]{leeInferringTradeDirection1991} observe a higher precision of the quote rule over the tick rule, which makes it their preferred choice. Second, by the means of a simple model, the authors demonstrate that the tick test can correctly classify at least \SI{85.00}{\percent} of all midspread trades if the model's assumptions of constant quotes between trades and the arrival of the market and standing orders following a Poisson process are met. +The popular \gls{LR} algorithm \autocite[\checkmark][745]{leeInferringTradeDirection1991} combines the (reverse) tick test and quote rule into a single rule, which is derived from two observations. First, \textcite[\checkmark][735--743]{leeInferringTradeDirection1991} observe a higher precision of the quote rule over the tick rule, which makes it their preferred choice. Second, by means of a simple model, the authors demonstrate that the tick test can correctly classify on average \SI{85.4}{\percent} of all midspread trades if the model's assumptions of constant quotes between trades and the arrival of the market and standing orders following a Poisson process are met. In combination, the algorithm primarily signs trades according to the quote rule.
Trades at the midpoint of the spread, unclassifiable by the quote rule, are classified by the tick test. Overall: \begin{equation} @@ -171,7 +169,7 @@ \subsubsection{Lee and Ready Algorithm}\label{sec:lee-and-ready-algorithm} \subsubsection{Ellis-Michaely-O'Hara Rule}\label{sec:ellis-michaely-ohara-rule} -\textcite[][536]{ellisAccuracyTradeClassification2000} examine the performance of the previous algorithms for stocks traded at \gls{NASDAQ}. By analyzing miss-classified trades with regard to the proximity of the trade to the quotes, they observe, that the quote rule and by extension, the \gls{LR} algorithm, perform particularly well at classifying trades executed at the bid and the ask price but trail the performance of the tick rule for trades inside or outside the spread \autocite[][535--536]{ellisAccuracyTradeClassification2000}. The authors combine these observations into a single rule, known as the \gls{EMO} algorithm. +\textcite[\checkmark][535--536]{ellisAccuracyTradeClassification2000} examine the performance of the previous algorithms for stocks traded at \gls{NASDAQ}. By analyzing misclassified trades with regard to the proximity of the trade to the quotes, they observe that the quote rule, and by extension the \gls{LR} algorithm, perform particularly well at classifying trades executed at the bid and the ask price but trail the performance of the tick rule for trades inside or outside the spread \autocite[\checkmark][535--536]{ellisAccuracyTradeClassification2000}. The authors combine these observations into a single rule, known as the \gls{EMO} algorithm. As such, the \gls{EMO} algorithm extends the tick rule by classifying trades at the quotes using the quote rule, and all other trades with the tick test. Formally, the classification rule is given by: \begin{equation} @@ -184,12 +182,12 @@ \subsubsection{Ellis-Michaely-O'Hara \end{cases} \label{eq:emo-rule} \end{equation} -\Cref{eq:emo-rule} embeds both the quote and tick rule. As trades off the quotes are classified by the tick rule, the algorithm's overall success rate is dominated by the tick test assuming most trades are off-the-quotes. For option markets \autocites[cp.][891]{savickasInferringDirectionOption2003}[][21]{grauerOptionTradeClassification2022} this dependence causes the performance to lag behind quote-based approaches, contrary to the successful adaption in the stock market \autocites[][541]{ellisAccuracyTradeClassification2000}[][3818]{chakrabartyTradeClassificationAlgorithms2007}. \textcite[][31--35]{grauerOptionTradeClassification2022} improve the classification accuracy for option trades by applying the reverse tick test as a proxy for the tick test. +\Cref{eq:emo-rule} embeds both the quote and tick rule. As trades off the quotes are classified by the tick rule, the algorithm's overall success rate is dominated by the tick test, assuming most trades are off the quotes. For option markets (e.g., \textcites[\checkmark][891]{savickasInferringDirectionOption2003}[\checkmark][12--13]{grauerOptionTradeClassification2022}) this dependence causes the performance to lag behind quote-based approaches, contrary to the successful adoption in the stock market (e.g., \autocites[\checkmark][541]{ellisAccuracyTradeClassification2000}[\checkmark][3813]{chakrabartyTradeClassificationAlgorithms2007}). \textcite[\checkmark][41--44]{grauerOptionTradeClassification2022} improve the classification accuracy for option trades by applying the reverse tick test as a proxy for the tick test.
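To make the interplay of the rules introduced so far concrete, the following schematic NumPy sketch implements the quote rule, the tick test, and the LR and EMO hybrids. It assumes aligned arrays of trade prices, prevailing quotes, and the last distinct trade price; trades a rule cannot classify are returned as 0 rather than signed randomly, and all names and data are illustrative.

```python
import numpy as np


def quote_rule(price, ask, bid):
    """Quote rule: buy (+1) above the midpoint, sell (-1) below, 0 if at the midpoint (unclassifiable)."""
    mid = 0.5 * (ask + bid)
    return np.sign(price - mid)


def tick_test(price, prev_distinct_price):
    """Tick test: buy on an uptick, sell on a downtick, relative to the last distinct trade price."""
    return np.sign(price - prev_distinct_price)


def lr_algorithm(price, ask, bid, prev_distinct_price):
    """LR: quote rule first, tick test only for midspread trades."""
    signs = quote_rule(price, ask, bid)
    at_mid = signs == 0
    signs[at_mid] = tick_test(price[at_mid], prev_distinct_price[at_mid])
    return signs


def emo_rule(price, ask, bid, prev_distinct_price):
    """EMO: quote rule only for trades at the quotes, tick test for all other trades."""
    signs = tick_test(price, prev_distinct_price)
    at_quotes = (price == ask) | (price == bid)
    signs[at_quotes] = quote_rule(price[at_quotes], ask[at_quotes], bid[at_quotes])
    return signs


# Toy data: three trades with quotes 99/101 and last distinct prices 100, 99.5, 100.
price = np.array([101.0, 100.0, 99.5])
ask, bid = np.full(3, 101.0), np.full(3, 99.0)
prev = np.array([100.0, 99.5, 100.0])
print(lr_algorithm(price, ask, bid, prev))  # [ 1.  1. -1.]
print(emo_rule(price, ask, bid, prev))      # [ 1.  1. -1.]
```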
\subsubsection{Chakrabarty-Li-Nguyen-Van-Ness Method}\label{sec:chakarabarty-li-nguyen-van-ness-method} -Like the previous two algorithms, the \gls{CLNV} method of \textcite[][3809]{chakrabartyTradeClassificationAlgorithms2012} is a hybrid of the quote and tick rule and extends the \gls{EMO} rule by a differentiated treatment of trades inside the quotes, which are notoriously hard to classify. The authors segment the bid-ask spread into deciles (ten equal-width bins) and classify trades around the midpoint (fourth to seventh decile) by the tick rule and trades close or outside the quotes are categorized by the tick rule. +Like the previous two algorithms, the \gls{CLNV} method of \textcite[\checkmark][3811--3812]{chakrabartyTradeClassificationAlgorithms2007} is a hybrid of the quote and tick rule and extends the \gls{EMO} rule by a differentiated treatment of trades inside the quotes, which are notoriously hard to classify. The authors segment the bid-ask spread into deciles (ten equal-width bins) and classify trades around the midpoint (fourth to seventh decile) by the tick rule, whereas trades close to the quotes are categorized by the quote rule and trades outside the quotes by the tick rule. \begin{equation} \operatorname{clnv} \colon \mathbb{N}^2 \to \mathcal{Y}, \quad \operatorname{clnv}(i, t)= @@ -201,17 +199,17 @@ \subsubsection{Chakrabarty-Li-Nguyen-Van-Ness \label{eq:CLNV-rule} \end{equation} -The algorithm is summarized in \cref{eq:CLNV-rule}. It is derived from a performance comparison of the tick rule (\gls{EMO} rule) against the quote rule (\gls{LR} algorithm) on stock data, whereby the accuracy was assessed separately for each decile.\footnote{The spread is assumed to be positive and evenly divided into ten deciles and the first to third deciles are classified by the quote rule. Counted from the bid, the first decile starts at $B_{i,t}$ and ends at $B_{i,t} + \tfrac{3}{10} (A_{i,t} - B_{i,t}) = \tfrac{7}{10} B_{i,t} + \tfrac{3}{10} A_{i,t}$ third decile. As all trade prices are below the midpoint, they are classified as a sell.} The classical \gls{CLNV} method uses the backward-looking tick rule. In the spirit of \textcite[][735]{leeInferringTradeDirection1991}, the tick test can be exchanged for the reverse tick test. +The algorithm is summarized in \cref{eq:CLNV-rule}. It is derived from a performance comparison of the tick rule (\gls{EMO} rule) against the quote rule (\gls{LR} algorithm) on stock data, whereby the accuracy was assessed separately for each decile.\footnote{The spread is assumed to be positive and evenly divided into ten deciles, and the first to third deciles are classified by the quote rule. Counted from the bid, the first decile starts at $B_{i,t}$ and the third decile ends at $B_{i,t} + \tfrac{3}{10} (A_{i,t} - B_{i,t}) = \tfrac{7}{10} B_{i,t} + \tfrac{3}{10} A_{i,t}$. As all trade prices are below the midpoint, they are classified as a sell.} The classical \gls{CLNV} method uses the backward-looking tick rule. In the spirit of \textcite[\checkmark][735]{leeInferringTradeDirection1991}, the tick test can be exchanged for the reverse tick test. \subsubsection{Stacked Rule}\label{sec:stacked-rule} -The previous algorithms are static concerning the used base rules and their alignment. Combining arbitrary rules into a single algorithm requires a generic procedure. \textcite[][18]{grauerOptionTradeClassification2022} combine basic and hybrid rules through stacking.
In this setting, the trade traverses a stack of pre-defined rules until a rule can classify the trade or the end of the stack is reached.\footnote{For a trade, which cannot be classified by any classifier, one may fallback on a random assignment or the majority class if the distribution of trades is imbalanced.} The classification is now dependent on the employed rules but also on their relative ordering. +The previous algorithms are static concerning the used base rules and their alignment. Combining arbitrary rules into a single algorithm requires a generic procedure. \textcite[\checkmark][15]{grauerOptionTradeClassification2022} combine basic and hybrid rules through stacking. In this setting, the trade traverses a stack of pre-defined rules until a rule can classify the trade or the end of the stack is reached.\footnote{For a trade, which cannot be classified by any classifier, one may fallback on a random assignment or the majority class if the distribution of trades is imbalanced.} The classification is now dependent on the employed rules but also on their relative ordering. -The most basic application is in the \gls{LR} algorithm, combining $\operatorname{quote} \to \operatorname{tick}$, whereby the quote rule is applied first as indicated by the arrow. For a more complex example consider the hybrid rule consisting of $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{nbbo}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ popularized in \textcite[][18]{grauerOptionTradeClassification2022}. Only a fraction of all trades are classifiable by the trade size rule, which is the primary rule, due to a narrow domain, and classification is deferred to lower rules in the stack, specifically the quote rule at the \gls{NBBO}, which by design has larger coverage. Theoretically, stacked rules can grow to great depth with an arbitrary arrangement. In practice, rules may be ordered greedily and new rules added if there are unclassified trades. +The most basic application is in the \gls{LR} algorithm, combining $\operatorname{quote} \to \operatorname{tick}$, whereby the quote rule is applied first as indicated by the arrow. For a more complex example consider the hybrid rule consisting of $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{nbbo}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ popularized in \textcite[\checkmark][15]{grauerOptionTradeClassification2022}. Only a fraction of all trades are classifiable by the trade size rule, which is the primary rule, due to a narrow domain, and classification is deferred to lower rules in the stack, specifically the quote rule at the \gls{NBBO}, which by design has larger coverage. Theoretically, stacked rules can grow to great depth with an arbitrary arrangement. In practice, rules may be ordered greedily and new rules added if there are unclassified trades. -\textcite[][3811]{chakrabartyTradeClassificationAlgorithms2007} and \textcite[][18]{grauerOptionTradeClassification2022} continue the trend for more complex classification rules, leading to a higher fragmented decision surface, and eventually resulting in improved classification accuracy. 
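The stacking just described can be sketched generically as follows; the conventions are illustrative: each rule returns +1, -1, or 0 for trades it cannot classify, the first non-zero sign in the stack decides, and the fallback stands in for a random or majority-class assignment.

```python
import numpy as np


def apply_stack(rules, features, fallback=-1):
    """Traverse a stack of rules in order; the first rule that classifies a trade (non-zero sign) decides."""
    signs = np.zeros(len(features["price"]))
    for rule in rules:
        undecided = signs == 0
        if not undecided.any():
            break
        subset = {name: values[undecided] for name, values in features.items()}
        signs[undecided] = rule(subset)
    signs[signs == 0] = fallback  # trades no rule classifies, e.g. random sign or majority class
    return signs


# Two toy base rules, each returning 0 when it cannot decide.
quote = lambda f: np.sign(f["price"] - 0.5 * (f["ask"] + f["bid"]))
tick = lambda f: np.sign(f["price"] - f["prev_price"])

features = {
    "price": np.array([101.0, 100.0, 100.0]),
    "ask": np.full(3, 101.0),
    "bid": np.full(3, 99.0),
    "prev_price": np.array([100.0, 99.5, 100.0]),
}
# The quote rule decides the first trade; the midspread trades fall through to the tick test.
print(apply_stack([quote, tick], features))  # [ 1.  1. -1.]
```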
Since the condition, for the selection of the base rule, is inferred from \emph{static} cut-off points at the decile boundaries of the spread including the midspread and quotes. This raises the question of whether classifiers trained on price and quote data can adapt to the data and improve upon classical trade classification rules. +\textcite[\checkmark][3812]{chakrabartyTradeClassificationAlgorithms2007} and \textcite[\checkmark][15]{grauerOptionTradeClassification2022} continue the trend towards more complex classification rules, leading to a more fragmented decision surface, and eventually resulting in improved classification accuracy. Since the condition for the selection of the base rule is inferred from \emph{static} cut-off points at the decile boundaries of the spread, including the midspread and quotes, this raises the question of whether classifiers trained on price and quote data can adapt to the data and improve upon classical trade classification rules. -The trend towards sophisticated, hybrid rules, combining as many as six base rules into a single classifier, has conceptual parallels to stacked ensembles found in machine learning and expresses the need for better classifiers. +The trend towards sophisticated hybrid rules, combining up to six base rules into a single classifier, has conceptual parallels to stacked ensembles found in machine learning and expresses the need for better classifiers. We provide an overview of state-of-the-art machine learning-based classifiers and start by framing trade classification as a supervised learning problem. \ No newline at end of file diff --git a/reports/Content/semisupervised-approaches.tex b/reports/Content/semisupervised-approaches.tex index 4f29621a..75b448aa 100644 --- a/reports/Content/semisupervised-approaches.tex +++ b/reports/Content/semisupervised-approaches.tex @@ -4,9 +4,9 @@ \section{Semi-Supervised Approaches}\label{sec:semi-supervised-approaches} \subsection{Framing as a Semi-supervised Learning Problem}\label{sec:problem-framing-2} -The supervised approaches depend on the availability of the trade initiator as the true label. Yet, obtaining the label is often restricted to the rare cases, where the trade initiator is provided by the exchange or to subsets of trades where the initiator can be inferred through matching procedures (cp. \cref{sec:trade-initiator}), which may bias the selection. Unlabeled trades, though, are abundant and can help improve the generalization performance of the classifier. This concern is addressed by semi-supervised methods. +The supervised approaches depend on the availability of the trade initiator as the true label. Yet, obtaining the label is often restricted to the rare cases where the trade initiator is provided by the exchange, or to subsets of trades where the initiator can be inferred through matching procedures (cp. \cref{sec:trade-initiator}). This may bias the selection. Unlabeled trades, though, are abundant and can help improve the generalization performance of the classifier. This concern is addressed by semi-supervised methods. -Semi-supervised methods leverage partially-labeled data by learning an algorithm on unlabeled instances alongside true labels \autocite[][6]{chapelleSemisupervisedLearning2006}. They are centered around the semi-supervised assumption of smoothness, which states that if two samples say $\mathbf{x}_{1}$ and $\mathbf{x}_{2}$, are nearby in a high-density region, their class labels $y_{1}$ and $y_{2}$ should also be similar.
Vice versa, if data points are separated by a low-density region, their labels may be different \autocite[][5]{chapelleSemisupervisedLearning2006}. +Semi-supervised methods leverage partially-labeled data by learning an algorithm on unlabeled instances alongside true labels \autocite[\checkmark][2]{chapelleSemisupervisedLearning2006}. They are centered around the semi-supervised assumption of smoothness, which states that if two samples say $\mathbf{x}_{1}$ and $\mathbf{x}_{2}$ are nearby in a high-density region, their class labels $y_{1}$ and $y_{2}$ should also be similar. Vice versa, if data points are separated by a low-density region, their labels may be different \autocite[\checkmark][5]{chapelleSemisupervisedLearning2006}. \begin{figure}[ht] \centering @@ -25,61 +25,61 @@ \subsection{Selection of Approaches}\label{sec:selection-of-approaches-1} \textbf{Gradient Boosting} -The success of supervised gradient boosting led to the development of gradient boosting for the semi-supervised setting. An early work of \textcite[][3--4]{dalche-bucSemisupervisedMarginBoost2001} explores replacing supervised weak learners, i.e., regression trees, with semi-supervised weak learners, i.e., mixture models and minimizes a loss function over labeled and unlabeled instances. Another line of research, including \textcites[][290--291]{bennettExploitingUnlabeledData2002}[][2003--2004]{mallapragadaSemiBoostBoostingSemiSupervised2009}, retain supervised weak learners to generate pseudo labels of unlabeled instances per iteration. True labeled and pseudo-labeled data is then used in fitting weak learners of subsequent iterations. Approaches differ regarding the selection criterion of the pseudo-labeled instances. Both lines of work, however, require changes to the boosting procedure or the base learners. +The success of supervised gradient boosting led to the development of gradient boosting for the semi-supervised setting. An early work of \textcite[\checkmark][555--556]{dalche-bucSemisupervisedMarginBoost2001} explores replacing supervised weak learners, i.e., regression trees, with semi-supervised weak learners, i.e., mixture models and minimizes a loss function over labeled and unlabeled instances. Another line of research, including \textcites[\checkmark][290--291]{bennettExploitingUnlabeledData2002}[\checkmark][2003--2004]{mallapragadaSemiBoostBoostingSemiSupervised2009}, retain supervised weak learners to generate pseudo labels of unlabeled instances per iteration. True labeled and pseudo-labeled data is then used in fitting weak learners of subsequent iterations. Approaches differ regarding the selection criterion of the pseudo-labeled instances. Both lines of work, however, require changes to the boosting procedure or the base learners. -An alternative is to pair gradient boosting with self-training. Self-training is a wrapper algorithm around a supervised classifier, that incorporates its most-confident predictions of unlabeled instances into the training procedure \autocite[][190]{yarowskyUnsupervisedWordSense1995}. In contrast to previous methods, pseudo-labels are generated exclusively from the fully-fledged ensemble, which is grown multiple times at a higher computational cost. Being a model-agnostic wrapper, it does not change the classifier and ensures maximum comparability. This, together with the widespread adoption in the literature, makes it a compelling choice for semi-supervised trade classification. +An alternative is to pair gradient boosting with self-training. 
Self-training is a wrapper algorithm around a supervised classifier, that incorporates its most-confident predictions of unlabeled instances into the training procedure \autocite[\checkmark][190]{yarowskyUnsupervisedWordSense1995}. In contrast to previous methods, pseudo-labels are generated exclusively from the fully-fledged ensemble, which is grown multiple times at a higher computational cost. Being a model-agnostic wrapper, it does not change the classifier and ensures maximum comparability. This, together with the widespread adoption in the literature, makes it a compelling choice for semi-supervised trade classification. \textbf{Transformer} -Whilst Transformers could be combined with self-training, a more promising approach is to pre-train Transformers on unlabeled data, and then fine-tune the network on the remaining labeled instances. Various studies report unanimously performance improvements from pre-training tabular Transformers, including \textcites[][8]{somepalliSaintImprovedNeural2021}[][7]{huangTabTransformerTabularData2020}. +Whilst Transformers could be combined with self-training, a more promising approach is to pre-train Transformers on unlabeled data, and then fine-tune the network on the remaining labeled instances. Various studies report unanimously performance improvements from pre-training tabular Transformers, including \textcites[\checkmark][8]{somepalliSaintImprovedNeural2021}[\checkmark][7--8]{huangTabTransformerTabularData2020}. -Until now we assumed the parameters e.g., weights and biases, of the Transformer to be initialized randomly. The joint goal of pre-training objectives is to initialize a neural network with weights that capture expressive representations of the input and thereby improve generalization performance over a random initialization when fine-tuning on a specific task \autocite[][12]{erhanWhyDoesUnsupervised}. The training is now decomposed into two stages: in the first stage the model is trained with respect to the pre-training objective to obtain the parameter estimates on unlabeled instances, and in the second stage the Transformer is initialized with the parameters and then finetuned on the labeled dataset. Particularly beneficial, general embeddings can be learned during pre-training, even if the true label, i.e., the trade initiator, is unknown or its definition varies between tasks. +Until now we assumed the parameters e.g., weights and biases, of the Transformer to be initialized randomly. The joint goal of pre-training objectives is to initialize a neural network with weights that capture expressive representations of the input and thereby improve generalization performance over a random initialization when fine-tuning on a specific task \autocite[\checkmark][636]{erhanWhyDoesUnsupervised}. The training is now decomposed into two stages: in the first stage the model is trained with respect to the pre-training objective to obtain the parameter estimates on unlabeled instances, and in the second stage the Transformer is initialized with the parameters and then finetuned on the labeled dataset. Particularly beneficial, general embeddings can be learned during pre-training, even if the true label, i.e., the trade initiator, is unknown or its definition varies between tasks. 
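A minimal PyTorch-style sketch of this two-stage procedure; the encoder, heads, pretext target, and hyperparameters are placeholders and do not correspond to the specific architecture or pre-training objective adopted later.

```python
import torch
import torch.nn as nn

# Shared encoder whose weights carry over from pre-training to fine-tuning; the architecture,
# heads, and data below are placeholders, not the model used later in this work.
encoder = nn.Sequential(nn.Linear(10, 64), nn.ReLU(), nn.Linear(64, 64), nn.ReLU())
pretrain_head = nn.Linear(64, 10)  # head for the (unspecified) pre-training objective
clf_head = nn.Linear(64, 1)        # binary head for buy/sell during fine-tuning

# Stage 1: pre-training on unlabeled trades against a pretext target.
opt = torch.optim.Adam(list(encoder.parameters()) + list(pretrain_head.parameters()), lr=1e-3)
x_unlabeled = torch.randn(256, 10)     # stand-in for a batch of unlabeled feature vectors
pretext_target = torch.randn(256, 10)  # stand-in for the objective-specific target
loss = nn.functional.mse_loss(pretrain_head(encoder(x_unlabeled)), pretext_target)
opt.zero_grad()
loss.backward()
opt.step()

# Stage 2: fine-tuning on labeled trades, reusing the pre-trained encoder weights.
opt = torch.optim.Adam(list(encoder.parameters()) + list(clf_head.parameters()), lr=1e-4)
x_labeled = torch.randn(64, 10)
y = torch.randint(0, 2, (64, 1)).float()  # 1 = buy, 0 = sell (mapped from +1/-1)
loss = nn.functional.binary_cross_entropy_with_logits(clf_head(encoder(x_labeled)), y)
opt.zero_grad()
loss.backward()
opt.step()
```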
-Pre-training objectives for tabular data differ vastly in their methodology and are often directly adapted from other domains including \gls{MLM} \autocite[][4174]{devlinBERTPretrainingDeep2019}, \gls{RTD} \autocite[][1--3]{clarkElectraPretrainingText2020}, or contrastive learning \autocite[][2]{chenSimpleFrameworkContrastive2020}. -As such, \textcite[][7]{huangTabTransformerTabularData2020} adapt \gls{MLM}, whereby features are randomly masked and the objective is to reconstruct the original input. Pre-training by \gls{RTD} aims to identify randomly replaced features and recover a binary mask used for replacement \autocite[][7]{huangTabTransformerTabularData2020}. \textcites[][3]{bahriSCARFSelfsupervisedContrastive2022}[][4--5]{yoonVIMEExtendingSuccess2020} reconstruct both the binary feature mask and the original input simultaneously. \textcite[][3]{somepalliSaintImprovedNeural2021} alter the methodology of \textcite[][4--5]{yoonVIMEExtendingSuccess2020} through a contrastive loss function. +Pre-training objectives for tabular data differ vastly in their methodology and are often directly adapted from other domains including \gls{MLM} \autocite[\checkmark][4174]{devlinBERTPretrainingDeep2019}, \gls{RTD} \autocite[\checkmark][2--3]{clarkElectraPretrainingText2020}, or contrastive learning \autocite[\checkmark][1598]{chenSimpleFrameworkContrastive2020}. +As such, \textcite[\checkmark][7]{huangTabTransformerTabularData2020} adapt \gls{MLM}, whereby features are randomly masked and the objective is to reconstruct the original input. Pre-training by \gls{RTD} aims to identify randomly replaced features and recover a binary mask used for replacement \autocite[\checkmark][7]{huangTabTransformerTabularData2020}. \textcites[\checkmark][3--4]{bahriSCARFSelfsupervisedContrastive2022}[][4--5]{yoonVIMEExtendingSuccess2020} reconstruct both the binary feature mask and the original input simultaneously. \textcite[\checkmark][3]{somepalliSaintImprovedNeural2021} alter the methodology of \textcite[\checkmark][11036--11037]{yoonVIMEExtendingSuccess2020} through a contrastive loss function. -With a multitude of methods, tested on different datasets and neural architectures, a fair comparison between pre-training methods is tedious. Yet, \textcite[][2-3]{rubachevRevisitingPretrainingObjectives2022} provide guidance in selecting objectives. Among the pre-training objectives that they benchmark, the \gls{RTD} objective was among the best-performing approaches. The \gls{RTD} objective is easy to optimize, unsupervised, and leaves the model architecture unaltered, which makes \gls{RTD} a compelling choice for pre-training on unlabeled data. +With a multitude of methods, tested on different datasets and neural architectures, a fair comparison between pre-training methods is tedious. Yet, \textcite[\checkmark][2-3]{rubachevRevisitingPretrainingObjectives2022} provide guidance in selecting objectives. Among the pre-training objectives that they benchmark, the \gls{RTD} objective was among the best-performing approaches. The \gls{RTD} objective is easy to optimize, unsupervised, and leaves the model architecture unaltered, which makes \gls{RTD} a compelling choice for pre-training on unlabeled data. The next chapter covers self-training in detail. 
\subsection{Gradient Boosted Trees With Self-Training}\label{sec:extensions-to-gradient-boosted-trees} -Self-training is a wrapper algorithm around a probabilistic classifier, that incorporates its predictions of unlabeled instances as pseudo labels \autocite[][190]{yarowskyUnsupervisedWordSense1995}. +Self-training is a wrapper algorithm around a probabilistic classifier that incorporates its predictions of unlabeled instances as pseudo labels \autocite[\checkmark][190]{yarowskyUnsupervisedWordSense1995}. -Initially, a base classifier is fitted on the labeled data points in a supervised manner. The classifier then assigns labels, so-called pseudo labels, to unlabeled instances. A subset of unlabeled instances with high-confidence predictions is selected, removed from the unlabeled dataset and added to the pseudo-labeled data dataset. A new classifier is then retrained on the labeled and pseudo-labeled instances \autocite[][190--192]{yarowskyUnsupervisedWordSense1995}. The process is repeated for several iterations until an abortion criterion applies, such as the maximum number of iterations is exhausted or when no unlabeled instances are left to label. +Initially, a base classifier is fitted on the labeled data points in a supervised manner. The classifier then assigns labels, so-called pseudo labels, to unlabeled instances. A subset of unlabeled instances with high-confidence predictions is selected, removed from the unlabeled dataset and added to the pseudo-labeled dataset. A new classifier is then retrained on the labeled and pseudo-labeled instances \autocite[\checkmark][190--192]{yarowskyUnsupervisedWordSense1995}. The process is repeated for several iterations until a stopping criterion applies, such as when the maximum number of iterations is exhausted or no unlabeled instances are left to label. Recall from our discussion on gradient-boosted trees in \cref{sec:gradient-boosting-procedure} that we optimized for the cross-entropy loss on the training set. When coupled with self-training, in each training iteration the classifier $F$ now jointly minimizes the loss over the labeled samples $\mathcal{D}$ and the pseudo-labeled samples $\not{\mathcal{U}}$: \begin{equation} L_{\mathrm{ST}}=\frac{1}{\left|\mathcal{D}\right|} \sum_{(\mathbf{x}, y) \in \mathcal{D}} L(F(\mathbf{x}), y)+\frac{\epsilon}{\left|\not{\mathcal{U}}\right|} \sum_{(\mathbf{x}, \tilde{y}) \in \not{\mathcal{U}}} L(F(\mathbf{x}), \tilde{y})+\lambda\|F\|^2, \end{equation} -where $\epsilon$ is a hyperparameter to control the impact of the pseudo-labeled data, $\tilde{y}$ is the pseudo-labeled instance, and $\lambda$ weights the regularization term \autocite[][4]{aminiSelfTrainingSurvey2023}. +where $\epsilon$ is a hyperparameter to control the impact of the pseudo-labeled data, $\tilde{y}$ is the pseudo label, and $\lambda$ weights the regularization term \autocite[\checkmark][4]{aminiSelfTrainingSurvey2023}. -In every iteration, only unlabeled instances are added to the training set, for which the predicted class probability exceeds a confidence threshold, say $\tau$. This approach has implications, as highlighted by \textcite[][2]{chenDebiasedSelfTrainingSemiSupervised2022}. The threshold $\tau$ becomes an important hyperparameter in controlling that no noisy labels are added to the training set, but a restriction to highly-confidence samples may lead to a data bias and over-confidence in the prediction.
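A schematic sketch of such a self-training loop with a confidence threshold, built around a generic scikit-learn classifier; the threshold, iteration budget, and base learner are illustrative, and the weighting and regularization terms of the loss above are omitted.

```python
import numpy as np
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingClassifier


def self_train(clf, X_lab, y_lab, X_unlab, tau=0.95, max_iter=5):
    """Iteratively add pseudo-labeled samples whose predicted class probability exceeds tau."""
    X_train, y_train = X_lab.copy(), y_lab.copy()
    for _ in range(max_iter):
        if len(X_unlab) == 0:
            break
        model = clone(clf).fit(X_train, y_train)
        proba = model.predict_proba(X_unlab)
        confident = proba.max(axis=1) >= tau
        if not confident.any():
            break
        pseudo_y = model.classes_[proba.argmax(axis=1)]
        X_train = np.vstack([X_train, X_unlab[confident]])
        y_train = np.concatenate([y_train, pseudo_y[confident]])
        X_unlab = X_unlab[~confident]
    return clone(clf).fit(X_train, y_train)


# Toy usage on random data; labels follow the +1/-1 convention for the trade initiator.
rng = np.random.default_rng(0)
X_lab, y_lab = rng.normal(size=(100, 5)), rng.choice([-1, 1], size=100)
X_unlab = rng.normal(size=(500, 5))
model = self_train(GradientBoostingClassifier(), X_lab, y_lab, X_unlab)
```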
Self-training is prone to a confirmation bias, as confident but wrong pseudo labels are erroneously incorporated into the training set, which in effect leads to a propagation of errors in the subsequent training rounds. +In every iteration, only those unlabeled instances for which the predicted class probability exceeds a confidence threshold, say $\tau$, are added to the training set. This approach has implications, as highlighted by \textcite[\checkmark][32427]{chenDebiasedSelfTrainingSemiSupervised2022}. The threshold $\tau$ becomes an important hyperparameter for ensuring that no noisy labels are added to the training set, but a restriction to high-confidence samples may lead to a data bias and over-confidence in the prediction. Self-training is prone to a confirmation bias, as confident but wrong pseudo labels are erroneously incorporated into the training set, which in effect leads to a propagation of errors in the subsequent training rounds. -At the same time, self-training puts a high emphasis on the correctness of the probability estimates in the base classifier. This is problematic for decision trees, known to produce poor probability estimates, as probabilities are derived from the class frequency in the leaf node containing few samples \autocite[][357--358]{tanhaSemisupervisedSelftrainingDecision2017}. However, as gradient boosting directly optimizes for the cross-entropy loss, the problem found for its ensemble member no longer occurs. +At the same time, self-training puts a high emphasis on the correctness of the probability estimates in the base classifier. This is problematic for decision trees, known to produce poor probability estimates, as probabilities are derived from the class frequency in the leaf node containing few samples \autocite[\checkmark][357--358]{tanhaSemisupervisedSelftrainingDecision2017}. However, as gradient boosting directly optimizes for the cross-entropy loss, the problem found for its ensemble members no longer occurs. -Independent of the base classifier, self-training increases computational cost, as training is repeated over several iterations on a growing training set \autocite[][9]{zophRethinkingPretrainingSelftraining2020}. Despite these limitations, the potentially improved decision boundary outweighs the concerns. +Independent of the base classifier, self-training increases computational cost, as training is repeated over several iterations on a growing training set \autocite[\checkmark][3841]{zophRethinkingPretrainingSelftraining2020}. Despite these limitations, the potentially improved decision boundary outweighs the concerns. \subsection{Transformers With Pre-training}\label{sec:extensions-to-transformer} -\gls{RTD} is a pre-training objective proposed by \textcite[][2--3]{clarkElectraPretrainingText2020} for the use in language models. The core idea is to randomly replace tokens with plausible alternatives and learn a binary classifier to distinguish between original and replaced tokens. Intuitionally, the random replacement forces the model to learn generalizable representations of the input, rather than memorizing the co-occurrence of certain tokens. Additionally, surprising the model with random tokens strengthens its ability to incorporate contextual information. +\gls{RTD} is a pre-training objective proposed by \textcite[\checkmark][2--3]{clarkElectraPretrainingText2020} for use in language models.
The core idea is to randomly replace tokens with plausible alternatives and learn a binary classifier to distinguish between original and replaced tokens. Intuitively, the random replacement forces the model to learn generalizable representations of the input, rather than memorizing the co-occurrence of certain tokens. Additionally, surprising the model with random tokens strengthens its ability to incorporate contextual information. \begin{figure}[ht] \centering {\renewcommand\normalsize{\small} \normalsize \input{./Graphs/random-token-replacement.pdf_tex}} - \caption[Replaced Token Detection]{Replaced Token Detection. Own work inspired by \autocite[][2--3]{clarkElectraPretrainingText2020}.} + \caption[Replaced Token Detection]{Replaced Token Detection. Visualization inspired by \autocite[\checkmark][3]{clarkElectraPretrainingText2020}.} \label{fig:random-token-replacement} \end{figure} \todo{Adapt to tabular data} -The approach uses two neural networks, namely the generator and the discriminator, typically implemented as Transformers, as visualized in \cref{fig:random-token-replacement}. The generator is responsible for generating replacement tokens and receives an input sequence, i.e., a sentence, that has been intentionally masked out. It learns to predict the original token of the now-masked token through tokens in the bidirectional context (cp. \cref{sec:attention}). For masking, an additional $\mathtt{[MASK]}$ token is introduced, which extends the vocabulary (cp. \cref{sec:token-embeddings}). Separately for each token, the final hidden state of the masked token is fed through a softmax activation to obtain the predicted probability distribution of the masked token and the cross entropy loss is used to compare against the true distribution. By replacing the masked token with a token from the generator distribution, convincing replacements now take place for some of the original inputs \autocite[][2--3]{clarkElectraPretrainingText2020}. +The approach uses two neural networks, namely the generator and the discriminator, typically implemented as Transformers, as visualized in \cref{fig:random-token-replacement}. The generator is responsible for generating replacement tokens and receives an input sequence, i.e., a sentence, that has been intentionally masked out. It learns to predict the original token at the now-masked position from the tokens in the bidirectional context (cp. \cref{sec:attention}). For masking, an additional $\mathtt{[MASK]}$ token is introduced, which extends the vocabulary (cp. \cref{sec:token-embeddings}). Separately for each token, the final hidden state of the masked token is fed through a softmax activation to obtain the predicted probability distribution of the masked token, and the cross-entropy loss is used to compare against the true distribution. By replacing the masked token with a token from the generator distribution, convincing replacements now take place for some of the original inputs \autocite[\checkmark][2--3]{clarkElectraPretrainingText2020}. -The discriminator then receives the corrupted input sequence and is trained to distinguish between original and replaced tokens originating from the generator.
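How the corrupted inputs and the binary detection targets can be constructed is sketched below, using the simplified, generator-free replacement strategy for tabular data discussed shortly; the replacement rate and all names are illustrative.

```python
import numpy as np


def rtd_corrupt(X, replace_prob=0.3, rng=None):
    """Randomly replace feature values with values drawn from the same column and return the
    corrupted matrix together with the binary mask the detector is trained to predict."""
    rng = np.random.default_rng() if rng is None else rng
    mask = rng.random(X.shape) < replace_prob  # 1 = replaced, 0 = original
    # Draw replacements column-wise so they remain plausible values for each feature.
    replacements = np.stack(
        [rng.choice(X[:, j], size=len(X)) for j in range(X.shape[1])], axis=1
    )
    X_corrupt = np.where(mask, replacements, X)
    return X_corrupt, mask.astype(int)


# Toy usage: a discriminator would be trained to recover `mask` from `X_corrupt`.
X = np.random.default_rng(0).normal(size=(4, 3))
X_corrupt, mask = rtd_corrupt(X, replace_prob=0.5, rng=np.random.default_rng(1))
print(mask)
```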
The output is a binary mask to be compared against the mask initially used for masking tokens in the generator \autocite[\checkmark][2--3]{clarkElectraPretrainingText2020}. +Applied to tabular datasets, \gls{RTD} transfers to randomly replacing feature values in $\mathbf{x}_{i}$ instead of sequences. The objective is now to predict a binary mask $\mathbf{m}_{i}\in \{0,1\}^{M}$ corresponding to $\mathbf{x}_{i}$, indicating which features, or entries in $\mathbf{x}_{i}$, have been replaced. Previous adaptations for tabular data, e.g., \textcite[\checkmark][3]{huangTabTransformerTabularData2020}, simplify the replacement strategy by sampling replacement values directly from the feature, which alleviates the need for a generator network and requires less compute. Since the replacement is done on a feature-by-feature basis, the replaced value is \emph{per se} harder to detect. For tabular data, the random replacement of feature values also strengthens the model's ability to incorporate a combination of features, rather than a single or few features based on their absolute value, which would facilitate overfitting. diff --git a/reports/Content/supervised-approaches.tex b/reports/Content/supervised-approaches.tex index a877ac82..a8cb342b 100644 --- a/reports/Content/supervised-approaches.tex +++ b/reports/Content/supervised-approaches.tex @@ -9,21 +9,21 @@ \subsection{Framing as a Supervised Learning Problem}\label{sec:problem-framing} All trade classification rules from \cref{sec:rule-based-approaches} perform discrete classification and assign a class to the trade. They are inherently unsupervised. Our focus is on supervised classification, where a classifier learns a function mapping between the input and the label, which represents the trade initiator. More insightful is to obtain not just the most probable class, but also the associated class probabilities for a trade to be a buy or sell. This gives insights into the quality of the prediction. -Thus, we frame trade signing as a supervised, probabilistic classification task.
This is similar to \textcite[\checkmark][272]{easleyDiscerningInformationTrade2016}, who alter the tick rule and \gls{BVC} algorithm to obtain the probability estimates of a buy from an individual or aggregated trades, but with a sole focus on trade signing on a trade-by-trade basis and supervised. For machine learning-based classifiers, a probabilistic view enables a richer evaluation but restricts the selection of classifiers. Trade classification rules, as presented in \cref{sec:rule-based-approaches}, do not profit from this alternative formulation as they yield hard probabilities only and so no insight into the confidence of the prediction is gained. -We introduce more notation, which is used throughout. Each data instance consists of a feature vector and the target. The former is given by $\mathbf{x} \in \mathbb{R}^{1 \times M}$ and described by a random variable $X$. Any of the $M$ features in $\mathbf{x}$ may be numerical, e.g., the trade price or categorical e.g., the security type. Like before, the target is given by $y \in \mathcal{Y}$ and described by a random variable $Y$. Each data instance is sampled from a joint probability distribution $\Pr(X, Y)$. The labeled data set with $N$ i.i.d. samples is denoted by $\mathcal{D} =\left\{\left(\mathbf{x}_i, y_i\right)\right\}_{i=1}^N$. For convienience, we define a feature matrix $\mathbf{X}=\left[\mathbf{x}_1,\ldots, \mathbf{x}_N\right]^{\top}$, that stores all instances and a corresponding vector of labels $\mathbf{y}=\left[y_1,\ldots, y_N \right]^{\top}$. +We introduce more notation, which is used throughout. Each data instance consists of a feature vector and the target. The former is given by $\mathbf{x} \in \mathbb{R}^{1 \times M}$ and described by a random variable $X$. Any of the $M$ features in $\mathbf{x}$ may be numerical, e.g., the trade price, or categorical e.g., the security type. Like before, the target is given by $y \in \mathcal{Y}$ and described by a random variable $Y$. Each data instance is sampled from a joint probability distribution $\Pr(X, Y)$. The labeled data set with $N$ i.i.d. samples is denoted by $\mathcal{D} =\left\{\left(\mathbf{x}_i, y_i\right)\right\}_{i=1}^N$. For convenience, we define a feature matrix $\mathbf{X}=\left[\mathbf{x}_1,\ldots, \mathbf{x}_N\right]^{\top}$, that stores all instances and a corresponding vector of labels $\mathbf{y}=\left[y_1,\ldots, y_N \right]^{\top}$. For our machine learning classifiers, we aim to model $\Pr_{\theta}(y \mid \mathbf{x})$ by fitting a classifier with the parameters $\theta$ on the training set. Given the estimated class probabilities, we retrieve the most probable class in $\mathcal{Y}$ as: \begin{equation} \hat{y}=\arg\max_{y \in \mathcal{Y}} \operatorname{Pr}(y \mid \mathbf{x}). \label{eq:class-from-prob} \end{equation} -\cref{eq:class-from-prob} allow alternating between a discrete and probabilistic formulation for trade classification. This enables a seamless comparison of classical rules and probabilistic classifiers in machine learning. +\cref{eq:class-from-prob} allows alternating between a discrete and probabilistic formulation for trade classification. This enables a seamless comparison of classical rules and probabilistic classifiers in machine learning. Next, we discuss state-of-the-art classifiers suitable for trade classification. \subsection{Selection of Approaches}\label{sec:selection-of-approaches} -In this thesis, we perform a succinct literature discussion to select a set of supervised classifiers based on empirical evidence. 
In anticipation of the results, we ultimately select the FT-Transformer and Gradient Boosting for trade classification. To guide our discussion, we establish the following requirements a classifier must fullfil: +We perform a succinct literature discussion to select a set of supervised classifiers based on empirical evidence. In anticipation of the results, we select the FT-Transformer and Gradient Boosting for trade classification. To guide our discussion, we establish the following requirements a classifier must fulfill: \begin{enumerate}[label=(\roman*),noitemsep] \item \emph{performance:} The approach must deliver state-of-the-art performance in tabular classification tasks. Trades are typically provided as tabular datasets, consisting of rows representing instances and columns representing features. The classifier must be well-suited for probabilistic classification on tabular data. \item \emph{scalability:} The approach must scale to datasets with more than 10~million samples. Due to the high trading activity and long data history, datasets may comprise millions of samples, so classifiers must cope with large quantities of trades. @@ -34,32 +34,32 @@ \subsection{Selection of Approaches}\label{sec:selection-of-approaches} \textbf{Wide Tree-Based Ensembles} -Traditionally, tree-based ensembles, in particular, \gls{GBRT} have dominated modeling on tabular data concerning predictive performance \autocites[][24--25]{grinsztajnWhyTreebasedModels2022}[][7]{kadraWelltunedSimpleNets2021}[][8]{gorishniyRevisitingDeepLearning2021}. At its core, tree-based ensembles combine the estimates of individual decision trees into an ensemble to obtain a more accurate prediction. For \gls{GBRT} \autocite[][9]{friedmanGreedyFunctionApproximation2001} the ensemble is constructed by sequentially adding small-sized trees into the ensemble that improve upon the error of the previous trees. Conceptually related to \glspl{GBRT} are random forests. Random forests \autocite[][6]{breimanRandomForests2001} fuse decision trees with the bagging principle \autocite[][123]{breimanBaggingPredictors1996} by growing multiple deep decision trees on random subsets of data and aggregating the individual estimates. +Traditionally, tree-based ensembles, in particular \gls{GBRT}, have dominated modeling on tabular data concerning predictive performance \autocites[\checkmark][512]{grinsztajnWhyTreebasedModels2022}[\checkmark][23935]{kadraWelltunedSimpleNets2021}[\checkmark][18939]{gorishniyRevisitingDeepLearning2021}. At their core, tree-based ensembles combine the estimates of individual decision trees into an ensemble to obtain a more accurate prediction. For \gls{GBRT} \autocite[\checkmark][1193]{friedmanGreedyFunctionApproximation2001}, the ensemble is constructed by sequentially adding small-sized trees that improve upon the error of the previous trees. Conceptually related to \glspl{GBRT} are random forests. Random forests \autocite[\checkmark][6]{breimanRandomForests2001} fuse decision trees with the bagging principle \autocite[\checkmark][123]{breimanBaggingPredictors1996} by growing multiple deep decision trees on random subsets of data and aggregating the individual estimates. -\textcite[][7-9]{grinsztajnWhyTreebasedModels2022} trace back the strong performance of tree-based ensembles in tabular classification tasks to being a non-rotationally-invariant learner and tabular data being non-invariant to rotation.
By intuition, rows and columns in a tabular dataset may be arranged in an arbitrary order, but each features carries a distinct meaning, which implies that feature values cannot be simply rotated without affecting the overall meaning. Thus, tabular data is non-invariant by rotation. So are tree-based ensembles, as they attend to each feature separately. This property also strengthens the model's ability to uninformative features \autocite[][8-9]{grinsztajnWhyTreebasedModels2022}. +\textcite[\checkmark][513--515]{grinsztajnWhyTreebasedModels2022} trace back the strong performance of tree-based ensembles in tabular classification tasks to being a non-rotationally-invariant learner and tabular data being non-invariant to rotation. By intuition, rows and columns in a tabular dataset may be arranged in an arbitrary order, but each feature carries a distinct meaning, which implies that feature values cannot be simply rotated without affecting the overall meaning. Thus, tabular data is non-invariant by rotation. So are tree-based ensembles, as they attend to each feature separately. This property also strengthens the model's robustness to uninformative features \autocite[\checkmark][513--515]{grinsztajnWhyTreebasedModels2022}. -\textcite[][13--14]{ronenMachineLearningTrade2022} have unparalleled success in classifying trades through random forests. Due to the framing as a probabilistic classification task, random forests are not optimal. This is because decision trees yield poorly calibrated probability estimates caused by limited samples in leaf nodes, which propagate to the ensemble \autocite[][356--360]{tanhaSemisupervisedSelftrainingDecision2017}. Gradient boosting is unaffected by this problem, and scales to large data sets due to the availability of highly optimized implementations that approximate the construction of ensemble members and can simultaneously learn from labeled and unlabeled instances. The state-of-the-art performance in tabular classification tasks, together with its ability to scale and extend, renders it suitable for trade classification. +\textcite[\checkmark][13--14]{ronenMachineLearningTrade2022} report unparalleled success in classifying trades through random forests. Due to the framing as a probabilistic classification task, random forests are not optimal. This is because decision trees yield poorly calibrated probability estimates caused by limited samples in leaf nodes, which propagate to the ensemble \autocite[\checkmark][356--360]{tanhaSemisupervisedSelftrainingDecision2017}. Gradient boosting is unaffected by this problem and scales to large data sets due to the availability of highly optimized implementations that approximate the construction of ensemble members and can simultaneously learn from labeled and unlabeled instances. The state-of-the-art performance in tabular classification tasks, together with its ability to scale and extend, renders it suitable for trade classification. \textbf{Deep Neural Networks} -Neural networks have emerged as powerful models for tabular data with several publications claiming to surpass \glspl{GBRT} in terms of performance. For brevity, we focus on two lines of research: regularized networks and attention-based networks, which have accumulated significant interest in the field. A recent overview of tabular deep learning can be found in \textcite[][1--22]{borisovDeepNeuralNetworks2022}.
+Neural networks have emerged as powerful models for tabular data with several publications claiming to surpass \glspl{GBRT} in terms of performance. For brevity, we focus on two lines of research: regularized networks and attention-based networks, which have accumulated significant interest in the field. A recent overview of tabular deep learning can be found in \textcite[\checkmark][1--21]{borisovDeepNeuralNetworks2022}. \emph{Regularized Networks} -Among the simplest neural networks are \glspl{MLP}, which consists of multiple linear layers with non-linear activation functions in between. \textcite[][9--10]{kadraWelltunedSimpleNets2021} among others, advocate for the use of vanilla \gls{MLP} with an extensive mix of regularization techniques, such as dropout \autocite{srivastavaDropoutSimpleWay} or residual connections \autocite{heDeepResidualLearning2015}, and report performance improvements over complex tabular-specific architectures or \glspl{GBRT}. Regularization is expected to enhance generalization performance, but the benefit is non-exclusive to \gls{MLP}. Conversely, when regularization is equally applied to tabular-specific architectures, the effect reverses and multiple works including \textcites[][7]{gorishniyRevisitingDeepLearning2021}[][5]{grinsztajnWhyTreebasedModels2022} suggest that regularized \gls{MLP} actually trail the performance of specialized tabular-specific architectures. Also, \glspl{MLP} are rotatinally-invariant learners, as showed in \textcite[][5]{grinsztajnWhyTreebasedModels2022}, which contradicts our reasoning from above. To meet our performance criterion we instead focus on specialized architectures, particularly attention-based networks, while still emphasizing the importance of a careful regularization and optimization. +Among the simplest neural networks are \glspl{MLP}, which consist of multiple linear layers with non-linear activation functions in between. \textcite[\checkmark][23936--23937]{kadraWelltunedSimpleNets2021}, among others, advocate for the use of vanilla \gls{MLP} with an extensive mix of regularization techniques, such as dropout \autocite[\checkmark][1930]{srivastavaDropoutSimpleWay} or residual connections \autocite[\checkmark][2]{heDeepResidualLearning2015}, and report performance improvements over complex tabular-specific architectures or \glspl{GBRT}. Regularization is expected to enhance generalization performance, but the benefit is non-exclusive to \gls{MLP}. Conversely, when regularization is equally applied to tabular-specific architectures, the effect reverses, and multiple works including \textcites[\checkmark][18938]{gorishniyRevisitingDeepLearning2021}[][5]{grinsztajnWhyTreebasedModels2022} suggest that regularized \glspl{MLP} trail the performance of specialized tabular-specific architectures. Also, \glspl{MLP} are rotationally-invariant learners, as shown in \textcite[\checkmark][511]{grinsztajnWhyTreebasedModels2022}, which contradicts our reasoning from above. To meet our performance criterion, we instead focus on specialized architectures, particularly attention-based networks, while still emphasizing the importance of careful regularization and optimization. \emph{Attention-based Networks} -Another emerging strand of research focuses on neural networks with an attention mechanism. Attention, intuitively, allows gathering information from the immediate context and learn relationships between features or between features and instances.
It is incorporated in various architectures, including the tree-like TabNet \autocite[][3--5]{arikTabnetAttentiveInterpretable2020}, and several Transformer-based architectures including TabTransformer \autocite[][2--3]{huangTabTransformerTabularData2020}, Self-Attention and Intersample Attention Transformer \autocite[][4--5]{somepalliSaintImprovedNeural2021}, Non-Parametric Transformer \autocite[][3--4]{kossenSelfAttentionDatapointsGoing2021}, and FT-Transformer \autocite[][4--5]{gorishniyRevisitingDeepLearning2021}. +Another emerging strand of research focuses on neural networks with an attention mechanism. Attention, intuitively, allows gathering information from the immediate context and learning relationships between features or between features and instances. It is incorporated in various architectures, including the tree-like TabNet \autocite[\checkmark][3--5]{arikTabnetAttentiveInterpretable2020}, and several Transformer-based architectures including TabTransformer \autocite[\checkmark][2--3]{huangTabTransformerTabularData2020}, Self-Attention and Intersample Attention Transformer \autocite[\checkmark][4--5]{somepalliSaintImprovedNeural2021}, Non-Parametric Transformer \autocite[\checkmark][28745--28746]{kossenSelfAttentionDatapointsGoing2021}, and FT-Transformer \autocite[\checkmark][18935--18936]{gorishniyRevisitingDeepLearning2021}. -TabNet \autocite[][3--5]{arikTabnetAttentiveInterpretable2020}, fuses the concept of decision trees with attention. Similar to growing a decision tree, several sub-networks are used to process the input in a sequential, hierarchical fashion. Sequential attention, a variant of attention, is used to decide which features to select in each step. The output of TabNet is the aggregate of all sub-networks. Its poor performance in independent comparisons e.g., \textcites[][7]{kadraWelltunedSimpleNets2021}[][7]{gorishniyRevisitingDeepLearning2021}, raises doubts about its usefulness. +TabNet \autocite[\checkmark][3--5]{arikTabnetAttentiveInterpretable2020} fuses the concept of decision trees with attention. Similar to growing a decision tree, several sub-networks are used to process the input in a sequential, hierarchical fashion. Sequential attention, a variant of attention, is used to decide which features to select in each step. The output of TabNet is the aggregate of all sub-networks. Its poor performance in independent comparisons, e.g., \textcites[][23934]{kadraWelltunedSimpleNets2021}[\checkmark][18398]{gorishniyRevisitingDeepLearning2021}, raises doubts about its usefulness. -The Self-Attention and Intersample Attention Transformer uses a specialized attention mechanism, the intersample attention, to perform attention over both columns and rows \autocite[][4--5]{somepalliSaintImprovedNeural2021}. Applied to our setting, the model would contextualize information from the trade itself, but also from neighboring trades, which is an unfair advantage over classical trade classification rules. Similarly, the Non-Parametric Transformer of \textcite[][3--4]{kossenSelfAttentionDatapointsGoing2021} uses the entire data set as a context, which rules out the application in our work. +The Self-Attention and Intersample Attention Transformer uses a specialized attention mechanism, the intersample attention, to perform attention over both columns and rows \autocite[\checkmark][4--5]{somepalliSaintImprovedNeural2021}.
Applied to our setting, the model would contextualize information from the trade itself, but also from neighboring trades, which is an unfair advantage over classical trade classification rules, considering split orders or complex trades. Similarly, the Non-Parametric Transformer of \textcite[\checkmark][3--4]{kossenSelfAttentionDatapointsGoing2021} uses the entire data set as a context, which rules out its application in our work. -Differently, TabTransformer \autocite[][2--3]{huangTabTransformerTabularData2020} performs attention per sample on categorical features-only. All numerical features are processed in a separate stream, a \gls{MLP}, which breaks correlations between categorical and numerical features \autocite[][2]{somepalliSaintImprovedNeural2021}. Most importantly though, most features in trade datasets are numerical. As such, trade classification would hardly profit from the Transformer architecture, causing the model to collapse to a vanilla \gls{MLP}. A more comprehensive approach is provided by \textcite[][4--5]{gorishniyRevisitingDeepLearning2021} in the form of FT-Transformer, that processes both numerical inputs and categorical input in Transformer blocks featuring an attention mechanism. Since it achieved state-of-the-art performance in independent empirical studies, like \textcite[][5]{grinsztajnWhyTreebasedModels2022}, and is non-rotationally invariant, we further consider FT-Transformer in our empirical study. Being based on the Transformer architecture, FT-Transformer naturally scales to large amounts of data and can utilize unlabeled data through self-training procedures. +In contrast, TabTransformer \autocite[\checkmark][2--3]{huangTabTransformerTabularData2020} performs attention per sample on categorical features only. All numerical features are processed in a separate stream, a \gls{MLP}, which breaks correlations between categorical and numerical features \autocite[\checkmark][2]{somepalliSaintImprovedNeural2021}. Most importantly though, most features in trade datasets are numerical. As such, trade classification would hardly profit from the Transformer architecture, causing the model to collapse to a vanilla \gls{MLP}. A more comprehensive approach is provided by \textcite[\checkmark][18935--18936]{gorishniyRevisitingDeepLearning2021} in the form of the FT-Transformer, which processes both numerical and categorical inputs in Transformer blocks featuring an attention mechanism. Since it achieved state-of-the-art performance in independent empirical studies, like \textcite[\checkmark][511]{grinsztajnWhyTreebasedModels2022}, and is non-rotationally invariant, we further consider the FT-Transformer in our empirical study. Being based on the Transformer architecture, the FT-Transformer naturally scales to large amounts of data and can utilize unlabeled data through self-training procedures. -The findings of \textcite[][50]{ronenMachineLearningTrade2022} do not support the use of neural networks in trade classification. But due to the lack of details regarding the model architecture, regularization techniques, and training insights, it is necessary to reevaluate these findings in the context of option trades. +The findings of \textcite[\checkmark][50]{ronenMachineLearningTrade2022} do not support the use of neural networks in trade classification. However, due to the lack of details regarding the model architecture, regularization techniques, and training insights, it is necessary to reevaluate these findings in the context of option trades.
To summarize, our study considers gradient boosting and the FT-Transformer, each trained on labeled or partially-labeled trades. This comparison is particularly appealing, as it enables a multi-faceted comparison of wide tree-based ensembles versus deep neural networks, as well as supervised versus semi-supervised methods. @@ -69,22 +69,22 @@ \subsection{Gradient Boosted Trees}\label{sec:gradient-boosted-trees} \subsubsection{Decision Tree}\label{sec:decision-tree} -Decision trees can be used in classification and regression. Despite solving a classification task, our focus is solely on regression trees, as it is the prevailing prediction model used in the gradient boosting algorithm \autocite[][9]{friedmanAdditiveLogisticRegression2000}. For this section, assume $y_i \in \mathbb{R}$. +Decision trees can be used in classification and regression. Although we solve a classification task, our focus is solely on regression trees, as they are the prevailing prediction model used in the gradient boosting algorithm \autocite[\checkmark][1198--1199]{friedmanGreedyFunctionApproximation2001}. For this section, assume $y_i \in \mathbb{R}$. -A decision tree splits the feature space into several disjoint regions $R$ through a sequence of recursive splits. For a binary decision tree, a single split leads to two new sub-regions, whose shape is determined by the features considered for splitting and the preceding splits. Trees are grown in depth until a minimum threshold for the number of samples within a node or some other stopping criterion applies \autocite[][42]{breimanClassificationRegressionTrees2017}. -A region corresponds to a terminal node in the tree. For each terminal node of the tree or unsplit region, the predicted response value is constant for the entire region and shared by all its samples \autocite[][229]{breimanClassificationRegressionTrees2017}. +A decision tree splits the feature space into several disjoint regions $R$ through a sequence of recursive splits. For a binary decision tree, a single split leads to two new sub-regions, whose shape is determined by the features considered for splitting and the preceding splits. Trees are grown in depth until a minimum threshold for the number of samples within a node is reached or some other stopping criterion applies \autocite[\checkmark][42]{breimanClassificationRegressionTrees2017}. +A region corresponds to a terminal node in the tree. For each terminal node of the tree or unsplit region, the predicted response value is constant for the entire region and shared by all its samples \autocite[\checkmark][229]{breimanClassificationRegressionTrees2017}. For a tree with $J$ regions $R_1, R_2,\ldots, R_J$, and some numerical input $\mathbf{x}$ the tree can be modeled as: \begin{equation} h(\mathbf{x})=\sum_{j=1}^{J} \gamma_{j} \mathbb{I}\left(\mathbf{x} \in R_{j}\right), \label{eq:decision-tree} \end{equation} -where $\mathbb{I}$ is the indicator function for region conformance and $\gamma_j$ the region's constant \autocite[][326]{hastietrevorElementsStatisticalLearning2009}. In the regression case, $\gamma_j$ is the mean of all target variables $y_i$ in the specific region. Since all samples of a region share a common response value, the tree estimates resemble a histogram that approximates the true regression surface, as visualized in \cref{fig:decision-boundary-dt}. +where $\mathbb{I}$ is the indicator function for region conformance and $\gamma_j$ the region's constant \autocite[\checkmark][307]{hastietrevorElementsStatisticalLearning2009}.
In the regression case, $\gamma_j$ is the mean of all target variables $y_i$ in the specific region. Since all samples of a region share a common response value, the tree estimates resemble a histogram that approximates the true regression surface, as visualized in \cref{fig:decision-boundary-dt}. \begin{figure}[ht] \centering \includegraphics{decision-boundary-dt.pdf} - \caption[Approximation of Decision Tree]{Approximation of a Decision Tree. A single split is performed at $\approx\num{3.1327505111694336}$ using the feature. All datapoints left respecitvely right of the split point share the region and are approximated by the region's response value.} + \caption[Approximation of Decision Tree]{Approximation of a Decision Tree. A single split is performed at $\approx\num{3.1327505111694336}$ using the feature. All data points left and right of the split point share their respective region and are approximated by the region's response value.} \label{fig:decision-boundary-dt} \end{figure} @@ -92,7 +92,7 @@ \subsubsection{Decision Tree}\label{sec:decision-tree} \begin{equation} \operatorname{L}_{\mathrm{SSE}} =\sum_{\mathbf{x}_{i} \in R_j}\left(y_{i}-\gamma_{j}\right)^{2}, \end{equation} -which is subsequently minimized \autocite[][231]{breimanClassificationRegressionTrees2017}. As documented in \textcite[][326]{hastietrevorElementsStatisticalLearning2009} we start with the entire dataset and scan through all combinations of features and possible split values. For a split by the feature $k$ at the value $s$, the child nodes are given by a pair of half-planes: +which is subsequently minimized \autocite[\checkmark][231]{breimanClassificationRegressionTrees2017}. As documented in \textcite[\checkmark][307]{hastietrevorElementsStatisticalLearning2009}, we start with the entire dataset and scan through all combinations of features and possible split values. For a split by the feature $k$ at the value $s$, the child nodes are given by a pair of half-planes: \begin{equation} R_1(k, s)=\left\{X \mid X_k \leq s\right\} \text { and } R_2(k, s)=\left\{X \mid X_k>s\right\}. \end{equation} @@ -102,17 +102,17 @@ \subsubsection{Decision Tree}\label{sec:decision-tree} \end{equation} Clearly, growing deeper trees leads to an improvement in the \gls{SSE}. Considering the extreme, where each sample has its own region, the tree would achieve a perfect fit in-sample but perform poorly on out-of-sample data. To reduce the sensitivity of the tree to changes in the training data, hence \emph{variance}, size complexity pruning procedures are employed. Likewise, if the decision tree is too simplistic, a high bias contributes to the model's overall expected error. Both extremes are to be avoided. -Ensemble methods decrease the expected error of the decision tree by combining multiple trees in a single model through minimizing the bias or variance term or both. Specifically, boosting addresses the bias and variance \autocites[][1672]{schapireBoostingMarginNew1998}[][29]{breimanRandomForests2001}. Next, we focus on \gls{GBRT}, a variant of boosting. +Ensemble methods decrease the expected error of the decision tree by combining multiple trees in a single model through minimizing the bias or variance term or both. Specifically, boosting addresses the bias and variance \autocites[\checkmark][1672]{schapireBoostingMarginNew1998}[\checkmark][29]{breimanRandomForests2001}. Next, we focus on \gls{GBRT}, a variant of boosting.
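The greedy split search described above can be illustrated with a short NumPy sketch; it is a regression stump on a single feature under the \gls{SSE} criterion and the half-plane construction, not the full, pruned CART procedure of the cited references.

```python
import numpy as np

def best_split_sse(x: np.ndarray, y: np.ndarray):
    """Scan all candidate split values s of one feature x and return the split
    minimizing the summed SSE over the half-planes R1 = {x <= s} and R2 = {x > s}."""
    best_s, best_sse = None, np.inf
    for s in np.unique(x)[:-1]:                       # last value would leave R2 empty
        left, right = y[x <= s], y[x > s]
        gamma_1, gamma_2 = left.mean(), right.mean()  # region constants: the means
        sse = ((left - gamma_1) ** 2).sum() + ((right - gamma_2) ** 2).sum()
        if sse < best_sse:
            best_s, best_sse = s, sse
    return best_s, y[x <= best_s].mean(), y[x > best_s].mean()

# Toy data: a noisy step function, in the spirit of the figure above.
rng = np.random.default_rng(0)
x = rng.uniform(0.0, 6.0, size=200)
y = np.where(x < 3.13, 1.0, 3.0) + 0.1 * rng.normal(size=200)
split, gamma_left, gamma_right = best_split_sse(x, y)
```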
\subsubsection{Gradient Boosting Procedure}\label{sec:gradient-boosting-procedure} -Gradient boosting iteratively combines oversimplified models, the weak learners, into an additive model to obtain an improved ensemble estimate. This chapter draws on \textcite[][9]{friedmanGreedyFunctionApproximation2001} to derive gradient boosting for binary classification. +Gradient boosting iteratively combines oversimplified models, the weak learners, into an additive model to obtain an improved ensemble estimate. This chapter draws on \textcite[\checkmark][1198--1199]{friedmanGreedyFunctionApproximation2001} to derive gradient boosting for binary classification. % classifier with outputs in [-1, 1] By \cref{sec:problem-framing} we perform binary probabilistic classification, and by \cref{sec:trade-initiator} we defined the labels to be $y \in \{-1,1\}$. For gradient boosting, instead of modeling the class-conditional probabilities directly, we model the conditional log odds of observing class $1$, i.e., a buyer-initiated trade, and convert them to class-conditional probabilities as needed. -Following \textcite[][9]{friedmanStochasticGradientBoosting2002} we set the loss function to be the cross-entropy loss, given by: +Following \textcite[\checkmark][1198]{friedmanStochasticGradientBoosting2002}, we set the loss function to be the cross-entropy loss, given by: \begin{equation} L_{\mathrm{BCE}} \colon \mathbb{R}^2 \to \mathbb{R} \quad L_{\mathrm{BCE}}(y, F) = \log(1+\exp(-2yF)) \label{eq:cross-entropy-loss} @@ -133,14 +133,14 @@ \subsubsection{Gradient Boosting \begin{equation} r_i=-\left[\frac{\partial L_{\mathrm{BCE}}\left(y_i, F\left(\mathbf{x}_i\right)\right)}{\partial F\left(\mathbf{x}_i\right)}\right]_{F(\mathbf{x})=F_{m-1}(\mathbf{x})}=2 y_i /\left(1+\exp \left(2 y_i F_{m-1}\left(\mathbf{x}_i\right)\right)\right). \end{equation} -\todo{yields the maximum decrease are similar to the components of the negative gradient descent. However, the major drawback is that the gradient is only defined for data points xi seen during training, contradicting the creation of a generalizing model.} -Typically, regression trees (cp. \cref{sec:decision-tree}) are chosen as weak learners since they are computationally cheap and can produce continuous estimates for the residual. The $m$-th regression tree contains $J$ terminal regions, denoted by $R_{j m}, j=1,2, \ldots, J_{m}$. We search for an estimate $\gamma_{j,m}$ for the terminal node $R_{jm}$ that minimizes the cross-entropy over all samples within the node: + +The residuals, i.e., the negative gradients, are only defined for training samples. To obtain a model that generalizes beyond them, the residuals are approximated by a weak learner. Typically, regression trees (cp. \cref{sec:decision-tree}) are chosen as weak learners since they are computationally cheap and can produce continuous estimates for the residual. The $m$-th regression tree contains $J$ terminal regions, denoted by $R_{j m}, j=1,2, \ldots, J_{m}$.
We search for an estimate $\gamma_{j,m}$ for the terminal node $R_{jm}$ that minimizes the cross-entropy over all samples within the node: \begin{equation} \gamma_{j m}=\arg \min _\gamma \sum_{\mathbf{x}_i \in R_{j m}} \log \left(1+\exp \left(-2 y_i\left(F_{m-1}\left(\mathbf{x}_i\right)+\gamma\right)\right)\right) \label{eq:region-estimate-gbm} \end{equation} -\cref{eq:region-estimate-gbm} cannot be solved in closed form and is typically approached by the Newton-Raphson method with a second-order approximation of the loss~\footnote{Compare the second-order Taylor polynomial given by \todo{complete?}.}: +\cref{eq:region-estimate-gbm} cannot be solved in closed form and is typically approached by the Newton-Raphson method with a second-order approximation of the loss: \begin{equation} \gamma_{j m}=\sum_{\mathbf{x}_i \in R_{j m}} r_i / \sum_{\mathbf{x}_i \in R_{j m}}\left|r_i\right|\left(2-\left|r_i\right|\right) \end{equation} @@ -150,11 +150,11 @@ \subsubsection{Gradient Boosting \begin{equation} F_{m}(\mathbf{x})=F_{m-1}(\mathbf{x})+\eta \sum_{j=1}^{J_{m}} \gamma_{j m} \mathbb{I}\left(\mathbf{x} \in R_{j m}\right). \end{equation} -After $M$ iterations we obtain the final estimate calculated as $F_{M}\left(\mathbf{x}\right)$. To avoid \gls{overfitting} the residuals, only proportional steps towards the negative gradient are taken, which is controlled by the learning rate \eta~\autocite[][13]{friedmanGreedyFunctionApproximation2001}. The learning rate \eta~and the size of the ensemble $M$ are deeply intertwined and best tuned together \autocite[][13]{friedmanGreedyFunctionApproximation2001}. +After $M$ iterations we obtain the final estimate calculated as $F_{M}\left(\mathbf{x}\right)$. To avoid \gls{overfitting} the residuals, only proportional steps towards the negative gradient are taken, which is controlled by the learning rate $\eta$~\autocite[\checkmark][1203]{friedmanGreedyFunctionApproximation2001}. The learning rate $\eta$ and the size of the ensemble $M$ are deeply intertwined and best tuned together \autocite[\checkmark][1203--1204]{friedmanGreedyFunctionApproximation2001}. -Gradient boosting is still prone to \gls{overfitting} due to fitting trees to point-wise gradients. One solution is to employ early stopping, whereby the ensemble is only grown in size, as long as adding more weak learners leads to a decrease in loss on the validation set \autocite[][384]{hastietrevorElementsStatisticalLearning2009}. Another approach is to limit the amount of data seen during training by fitting trees on random subset of samples, as proposed in \textcite[][3]{friedmanStochasticGradientBoosting2002}, or on a subset of features, as popularized by \textcite[][3]{chenXGBoostScalableTree2016}. \textcite[][6]{prokhorenkovaCatBoostUnbiasedBoosting2018} grow oblivious trees, which use the same splitting criterion for all nodes of one level in a tree. The rationale is, that these arguably simplistic trees, and achieve an imperfect fit, which regularizes the model. Finally, the loss function can be extended for a $\ell_2$ regularization term to penalize the model for complexity \autocite[][2]{chenXGBoostScalableTree2016}. +Gradient boosting is still prone to \gls{overfitting} due to fitting trees to point-wise gradients. One solution is to employ early stopping, whereby the ensemble is only grown in size as long as adding more weak learners leads to a decrease in loss on the validation set \autocite[\checkmark][365]{hastietrevorElementsStatisticalLearning2009}.
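The following sketch condenses the procedure derived above, with shallow scikit-learn regression trees as weak learners, the Newton step for the terminal-node estimates, and the shrunken update controlled by $\eta$. The log-odds initialization and the conversion of $F_M(\mathbf{x})$ back to probabilities are stated here as assumptions of the illustration (one common choice for the two-class case), not as a reference implementation.

```python
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_gbrt(X, y, n_trees=100, eta=0.1, max_depth=3):
    """Gradient boosting for y in {-1, 1} with the loss log(1 + exp(-2 y F))."""
    y_bar = y.mean()
    F0 = 0.5 * np.log((1 + y_bar) / (1 - y_bar))   # log-odds prior (common initialization)
    F = np.full(len(y), F0)
    trees, leaf_gammas = [], []
    for _ in range(n_trees):
        r = 2 * y / (1 + np.exp(2 * y * F))        # pseudo-residuals r_i
        tree = DecisionTreeRegressor(max_depth=max_depth).fit(X, r)
        leaves = tree.apply(X)                     # terminal region R_jm per sample
        gamma = {}                                 # Newton step per terminal region
        for j in np.unique(leaves):
            r_j = r[leaves == j]
            gamma[j] = r_j.sum() / (np.abs(r_j) * (2 - np.abs(r_j))).sum()
        F = F + eta * np.array([gamma[j] for j in leaves])   # shrunken update F_m
        trees.append(tree)
        leaf_gammas.append(gamma)
    return F0, trees, leaf_gammas, eta

def predict_proba_buy(model, X):
    """Recover Pr(y = 1 | x) from the log odds F_M(x)."""
    F0, trees, leaf_gammas, eta = model
    F = np.full(len(X), F0)
    for tree, gamma in zip(trees, leaf_gammas):
        F = F + eta * np.array([gamma[j] for j in tree.apply(X)])
    return 1.0 / (1.0 + np.exp(-2.0 * F))

# Toy usage with random features and labels in {-1, 1}.
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
y = np.where(X[:, 0] + 0.5 * X[:, 1] > 0, 1, -1)
model = fit_gbrt(X, y, n_trees=50)
p_buy = predict_proba_buy(model, X)
```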
Another approach is to limit the amount of data seen during training by fitting trees on a random subset of samples, as proposed in \textcite[\checkmark][369]{friedmanStochasticGradientBoosting2002}, or on a subset of features, as popularized by \textcite[\checkmark][787]{chenXGBoostScalableTree2016}. \textcite[\checkmark][6644]{prokhorenkovaCatBoostUnbiasedBoosting2018} grow oblivious trees, which use the same splitting criterion for all nodes of one level in a tree. The rationale is that these arguably simplistic trees achieve an imperfect fit, which regularizes the model. Finally, the loss function can be extended by an $\ell_2$ regularization term to penalize the model for complexity \autocite[\checkmark][786]{chenXGBoostScalableTree2016}. -In recent years, several variants of gradient boosting have been proposed and studied in the literature, including CatBoost \autocite[][1--23]{prokhorenkovaCatBoostUnbiasedBoosting2018}, XGBoost \autocite[][1--13]{chenXGBoostScalableTree2016}, and LightGBM \autocite[][3]{keLightGBMHighlyEfficient2017}, which differ by the policy how trees are grown and how \gls{overfitting} is addressed. Performance-wise, differences between the implementations are negligible, as empirical studies suggest \autocites[][8]{grinsztajnWhyTreebasedModels2022}[][19--20]{gorishniyRevisitingDeepLearning2021}[][7]{somepalliSaintImprovedNeural2021}[][14]{borisovDeepNeuralNetworks2022}. +In recent years, several variants of gradient boosting have been proposed and studied in the literature, including CatBoost \autocite[\checkmark][6639--6649]{prokhorenkovaCatBoostUnbiasedBoosting2018}, XGBoost \autocite[\checkmark][785--794]{chenXGBoostScalableTree2016}, and LightGBM \autocite[\checkmark][785--794]{keLightGBMHighlyEfficient2017}, which differ in how trees are grown and how \gls{overfitting} is addressed. Performance-wise, differences between the implementations are negligible, as empirical studies suggest \autocites[][8]{grinsztajnWhyTreebasedModels2022}[][19--20]{gorishniyRevisitingDeepLearning2021}[][7]{somepalliSaintImprovedNeural2021}[][14]{borisovDeepNeuralNetworks2022}. As we noted at the beginning, $F_M(\mathbf{x})$ models the log odds. We can recover the class-conditional probabilities $\widehat{\operatorname{Pr}}(y \mid \mathbf{x})$ by taking the inverse: \begin{equation} @@ -168,19 +168,19 @@ \subsection{Transformer Networks}\label{sec:transformer-networks} \subsubsection{Architectural Overview}\label{sec:architectural-overview} -The Transformer is a neural network architecture by \textcite[][2--6]{vaswaniAttentionAllYou2017} proposed for sequence-to-sequence modeling. Its original application is in machine translation, whereby sentences in the source language are translated into sentences in the target language. More precisely, the sentence is first decomposed into individual \glspl{token} and mapped into a sequence of \glspl{embedding}, which are rich vector representations of the raw input. The Transformer then processes the \glspl{embedding} to generate the output sequence. +The Transformer is a neural network architecture by \textcite[\checkmark][6002--6006]{vaswaniAttentionAllYou2017} proposed for sequence-to-sequence modeling. Its original application is in machine translation, whereby sentences in the source language are translated into sentences in the target language.
More precisely, the sentence is first decomposed into individual \glspl{token} and mapped into a sequence of \glspl{embedding}, which are rich vector representations of the raw input. The Transformer then processes the \glspl{embedding} to generate the output sequence. As the network operates on \glspl{embedding}, rather than strings, the architecture is not constrained to process textual data. It has been adapted to other modalities including image data \autocites[][2--5]{parmarImageTransformer2018}[][3]{dosovitskiyImageWorth16x162021} and tabular data \autocite[cp.][4]{gorishniyRevisitingDeepLearning2021}. The latter is important for our work, as derived in \cref{sec:selection-of-approaches}. -Following the architecture for machine translation of \textcite[][3]{sutskeverSequenceSequenceLearning2014}, the network features two main components: the encoder and the decoder. A sequence of \glspl{token} is first mapped to a sequence of \glspl{embedding} and augmented with positional information. The encoder receives these \glspl{embedding} and creates an enriched representation from it by encoding the context in which the input appears i.e., the surrounding words. The output of the encoder is then fed to the decoder. The decoder takes the embedded target sequence along with parts of the encoded representation of the input, to autoregressively generate the output sequence, i.e., the translation in the target language \gls{token} by \gls{token} \autocite[][3]{vaswaniAttentionAllYou2017}. \cref{fig:transformer-architecture-overview} depicts the complete architecture and serves as a guide through the subsequent sub-chapters. +Following the architecture for machine translation of \textcite[\checkmark][3106]{sutskeverSequenceSequenceLearning2014}, the network features two main components: the encoder and the decoder. A sequence of \glspl{token} is first mapped to a sequence of \glspl{embedding} and augmented with positional information. The encoder receives these \glspl{embedding} and creates an enriched representation from them by encoding the context in which the input appears, i.e., the surrounding words. The output of the encoder is then fed to the decoder. The decoder takes the embedded target sequence along with parts of the encoded representation of the input to autoregressively generate the output sequence, i.e., the translation in the target language \gls{token} by \gls{token} \autocite[\checkmark][6003]{vaswaniAttentionAllYou2017}. \cref{fig:transformer-architecture-overview} depicts the complete architecture and serves as a guide through the subsequent sub-chapters. -The encoder consists of $\gls{L}=6$ stacked Transformer blocks \autocite[][6]{vaswaniAttentionAllYou2017}. Each block itself is composed of two sub-layers: a multi-head self-attention layer, followed by a position-wise, \gls{feed-forward-network}. Both components serve a distinct purpose in the Transformer. The self-attention mechanism encodes the context in which the input appears onto the \glspl{embedding}, whereas the \gls{feed-forward-network} serves as a long-term memory persisting information outside the immediate context. In the multi-head self-attention mechanism of the encoder, inputs can learn from any \gls{token} of the input sequence, even if a \gls{token} appears causally before the other input. Each of the sub-layers is surrounded by skip connections \autocite[][2]{heDeepResidualLearning2015} and followed by layer normalization \autocite[][4]{baLayerNormalization2016} to facilitate and stabilize training.
Stacking multiple Transformer blocks enables the model to learn hierarchical features from the inputs and targets. Applied to language processing, the first layers in the stack extract coarse-grained syntactic features, and subsequent layers learn fine-grained semantic features \autocites[][3651]{jawaharWhatDoesBERT2019}[][4596]{tenneyBERTRediscoversClassical2019}. For tabular data, this translates to frequent feature combinations or infrequent feature interactions. +The encoder consists of $\gls{L}=6$ stacked Transformer blocks \autocite[\checkmark][6009]{vaswaniAttentionAllYou2017}. Each block itself is composed of two sub-layers: a multi-head self-attention layer, followed by a position-wise \gls{feed-forward-network}. Both components serve a distinct purpose in the Transformer. The self-attention mechanism encodes the context in which the input appears onto the \glspl{embedding}, whereas the \gls{feed-forward-network} serves as a long-term memory persisting information outside the immediate context. In the multi-head self-attention mechanism of the encoder, inputs can learn from any \gls{token} of the input sequence, even if a \gls{token} appears causally before the other input. Each of the sub-layers is surrounded by skip connections \autocite[\checkmark][2]{heDeepResidualLearning2015} and followed by layer normalization \autocite[\checkmark][4]{baLayerNormalization2016} to facilitate and stabilize training. Stacking multiple Transformer blocks enables the model to learn hierarchical features from the inputs and targets. Applied to language processing, the first layers in the stack extract coarse-grained syntactic features, and subsequent layers learn fine-grained semantic features \autocites[][3651]{jawaharWhatDoesBERT2019}[][4596]{tenneyBERTRediscoversClassical2019}. For tabular data, this translates to frequent feature combinations or infrequent feature interactions. Aside from the feed-forward sub-layer, the decoder contains a sub-layer for multi-head self-attention on the output of the encoder, known as cross-attention, and a masked variant of the multi-head self-attention for use on the output sequence. Here, causal masking enforces the autoregressive properties of the decoder. -The output of the decoder is finally passed through a linear layer with a softmax activation function to unembed the output and retrieve the probabilities of the next \gls{token} \autocite[][5]{vaswaniAttentionAllYou2017}. Since the output sequence is generated autoregressively, the most probable \gls{token} is fed back as input to the decoder to provide context for the following \glspl{token} until the remaining sequence is generated. +The output of the decoder is finally passed through a linear layer with a softmax activation function to unembed the output and retrieve the probabilities of the next \gls{token} \autocite[\checkmark][6005]{vaswaniAttentionAllYou2017}. Since the output sequence is generated autoregressively, the most probable \gls{token} is fed back as input to the decoder to provide context for the following \glspl{token} until the remaining sequence is generated. -For its original application, machine translation, both the encoder and decoder are used. Yet, the modular design allows adapting Transformers to a wider range of use cases, some of which only require the encoder or decoder.
\textcite[][16--17]{raffelExploringLimitsTransfer2020} differentiate these modes: encoder-only architecture, which encodes the input to obtain an enriched representation, decoder-only architectures to generate new \glspl{token} and encoder-decoder models for sequence-to-sequence modeling autoregressively. As our focus is on the probabilistic classification of tabular data, the goal is to learn an enriched representation of the input for classifying the label, here $\gls{y}$, rather than generating new samples. As such, encoder-only Transformers suffice. This insight also guides the structure in the next chapters, which focus on \glspl{embedding} and the inner workings of the encoder. +For its original application, machine translation, both the encoder and decoder are used. Yet, the modular design allows adapting Transformers to a wider range of use cases, some of which only require the encoder or decoder. \textcite[\checkmark][16--17]{raffelExploringLimitsTransfer2020} differentiate these modes: encoder-only architectures, which encode the input to obtain an enriched representation; decoder-only architectures, which generate new \glspl{token}; and encoder-decoder models for autoregressive sequence-to-sequence modeling. As our focus is on the probabilistic classification of tabular data, the goal is to learn an enriched representation of the input for classifying the label, here $\gls{y}$, rather than generating new samples. As such, encoder-only Transformers suffice. This insight also guides the structure in the next chapters, which focus on \glspl{embedding} and the inner workings of the encoder. \begin{landscape} \begin{figure}[ht] @@ -188,26 +188,24 @@ \subsubsection{Architectural Overview}\label{sec:architectural-overview} {\renewcommand\normalsize{\scriptsize}% \normalsize \input{./Graphs/transformer-architecture.pdf_tex}} - \caption[Overview Over the Transformer Architecture]{Overview over the Transformer Architecture. The left part shows the self-attention mechanism discussed in \cref{sec:attention}. The central part depicts the multi-head self-attention mechanism, as covered in \cref{sec:attention}. The right part shows the encoder and decoder stack, as well as the \gls{embedding} mechanism as covered in \cref{sec:token-embeddings} onwards. Own work inspired by \textcite[][3]{tayEfficientTransformersSurvey2022}.} + \caption[Overview Over the Transformer Architecture]{Overview over the Transformer Architecture. The left part shows the self-attention mechanism discussed in \cref{sec:attention}. The central part depicts the multi-head self-attention mechanism, as covered in \cref{sec:attention}. The right part shows the encoder and decoder stack, as well as the \gls{embedding} mechanism as covered in \cref{sec:token-embeddings} onwards. Visualization inspired by \textcite[\checkmark][3]{tayEfficientTransformersSurvey2022}.} \label{fig:transformer-architecture-overview} \end{figure} \end{landscape} \subsubsection{Token Embedding}\label{sec:token-embeddings} -As explained previously, Transformers operate on sequences of numeric vector representations, the \emph{token embeddings}. The classical Transformer was trained on \emph{word embeddings}. Nevertheless, \gls{token} embeddings are generic and arbitrary inputs that can be embedded and then processed by the Transformer. In the spirit of \textcite[][5]{vaswaniAttentionAllYou2017}, we first explore word embeddings for textual data, before adapting embeddings to the tabular domain.
- -\todo{write down, how the sequence of token ids is constructed.} +As explained previously, Transformers operate on sequences of numeric vector representations, the \emph{token embeddings}. The classical Transformer was trained on \emph{word embeddings}. Nevertheless, \gls{token} embeddings are generic, and arbitrary inputs can be embedded and then processed by the Transformer. In the spirit of \textcite[\checkmark][6005]{vaswaniAttentionAllYou2017}, we first explore word embeddings for textual data, before adapting embeddings to the tabular domain. \textbf{Embeddings For Textual Data} -To obtain \gls{token} embeddings from the raw input sequences i.e., a sentence, the sequence is first split into constituent vocabulary elements, the \emph{tokens}. All known \glspl{token} are stored in a vocabulary. The vocabulary $V$ consists of $N_{V}=|V|$ elements and maps \glspl{token} onto their unique integer keys, referred to as \emph{token-ids} \autocite[][3]{phuongFormalAlgorithmsTransformers2022}. Apart from \glspl{token} in the training corpus, the vocabulary may include special \glspl{token}, like the $\mathtt{[UNK]}$ \gls{token} to handle out-of-vocabulary items or $\mathtt{[CLS]}$ \gls{token} for storing an aggregate representation of the sequence for classification \autocite[cp.][4]{devlinBERTPretrainingDeep2019}. +To obtain \gls{token} embeddings from the raw input sequences, i.e., a sentence, the sequence is first split into constituent vocabulary elements, the \emph{tokens}. All known \glspl{token} are stored in a vocabulary. The vocabulary $V$ consists of $N_{V}=|V|$ elements and maps \glspl{token} onto their unique integer keys, referred to as \emph{token-ids} \autocite[\checkmark][3]{phuongFormalAlgorithmsTransformers2022}. Apart from \glspl{token} in the training corpus, the vocabulary may include special \glspl{token}, like the $\mathtt{[UNK]}$ \gls{token} to handle out-of-vocabulary items or the $\mathtt{[CLS]}$ \gls{token} for storing an aggregate representation of the sequence for classification \autocite[cp.][4]{devlinBERTPretrainingDeep2019}. -For ease of explanation, we equate \glspl{token} with words.\footnote{There is a subtle difference between \glspl{token} and words. A \gls{token} can be words including punctuation marks. But words can also be split into multiple \glspl{token}, such as sub-words \autocite[][3]{bojanowskiEnrichingWordVectors2017} or characters. To decrease the size of the vocabulary, words may be reduced to their stems, lower-cased, and stop words be removed.} Consider the following example with a small vocabulary of $V=\left\{1, N_v\right\}$ and a mapping between the \gls{token} and token-id of $\mathrm{'queen'}\mapsto 1$; $\mathrm{'king'}\mapsto 2$. For the sample sequence »Kings and Queens«, the sequence of token-ids is $\mathbf{s}=[2, 1]$, after applying tokenizing by words and common pre-processing like lower-casing, and the removal of the stop word »and«. Arbitrary sequences are given by $\mathbf{s} \equiv s[1: \ell] \equiv$ $s[1] s[2] \ldots s[\ell] \in V^*$. +For ease of explanation, we equate \glspl{token} with words.\footnote{There is a subtle difference between \glspl{token} and words. A \gls{token} can be a word, including punctuation marks. But words can also be split into multiple \glspl{token}, such as sub-words \autocite[\checkmark][137]{bojanowskiEnrichingWordVectors2017} or characters.
To decrease the size of the vocabulary, words may be reduced to their stems, lower-cased, and stop words removed.} Consider the following example with a small vocabulary of $V=\left\{1, N_v\right\}$ and a mapping between the \gls{token} and token-id of $\mathrm{'queen'}\mapsto 1$; $\mathrm{'king'}\mapsto 2$. For the sample sequence »Kings and Queens«, the sequence of token-ids is $\mathbf{s}=[2, 1]$, after tokenizing by words and common pre-processing like lower-casing and the removal of the stop word »and«. Arbitrary sequences are given by $\mathbf{s} \equiv s[1: \ell] \equiv$ $s[1] s[2] \ldots s[\ell] \in V^*$. -The conversion to token-ids, however, loses the semantics, as token-ids may be assigned arbitrarily or ordering by semantics may not be feasible. This limitation can be overcome by embeddings, as pioneered by \textcite[][1139]{bengioNeuralProbabilisticLanguage}, which map each token-id into a high-dimensional space. By representing words as a vector, semantic and syntactic relationships between tokens can be encoded. As such, related words share a similar embedding vector \autocite[][1139]{bengioNeuralProbabilisticLanguage}. Moreover, word embeddings are semantically meaningful and can capture linguistic regularities, like gender through offsets between vectors \autocite[][748--749]{mikolovLinguisticRegularitiesContinuous2013}. +The conversion to token-ids, however, loses the semantics, as token-ids may be assigned arbitrarily or ordering by semantics may not be feasible. This limitation can be overcome by embeddings, as pioneered by \textcite[\checkmark][1139]{bengioNeuralProbabilisticLanguage}, which map each token-id into a high-dimensional space. By representing words as vectors, semantic and syntactic relationships between tokens can be encoded. As such, related words share a similar embedding vector \autocite[\checkmark][1139]{bengioNeuralProbabilisticLanguage}. Moreover, word embeddings are semantically meaningful and can capture linguistic regularities, like gender through offsets between vectors \autocite[\checkmark][748--749]{mikolovLinguisticRegularitiesContinuous2013}. -The embedding layer from \cref{fig:transformer-architecture-overview} is ultimately a lookup table to retrieve the embedding vector $\gls{e} \in \mathbb{R}^{d_{e}}$ from a learned embedding matrix $\gls{W-e} \in \mathbb{R}^{d_{e} \times N_{V}}$ with the token-id $v \in V \cong\left[N_{V}\right]$ as shown:\footnote{Throughout our discussion on Transformers we adopt a notation proposed in \textcite[][1--16]{phuongFormalAlgorithmsTransformers2022}.} +The embedding layer from \cref{fig:transformer-architecture-overview} is ultimately a lookup table to retrieve the embedding vector $\gls{e} \in \mathbb{R}^{d_{e}}$ from a learned embedding matrix $\gls{W-e} \in \mathbb{R}^{d_{e} \times N_{V}}$ with the token-id $v \in V \cong\left[N_{V}\right]$ as shown:\footnote{Throughout our discussion on Transformers we adopt a notation proposed in \textcite[\checkmark][1--16]{phuongFormalAlgorithmsTransformers2022}.} \begin{equation} \gls{e}=\gls{W-e}\left[:, v\right]. \label{eq:word-embeddings} \end{equation} @@ -227,16 +225,16 @@ \subsubsection{Token Embedding}\label{sec:token-embeddings} \textbf{Embeddings For Numerical Data} -Transformer networks can handle numerical features, such as the trade price, by mapping the scalar value to a high-dimensional embedding vector and process sequences thereof \autocite[][3]{gorishniyEmbeddingsNumericalFeatures2022}.
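A minimal NumPy sketch of the lookup in \cref{eq:word-embeddings}, using the toy vocabulary from the example above; the random matrix is only a stand-in for the learned embedding matrix, and the dimensions are illustrative.

```python
import numpy as np

rng = np.random.default_rng(0)
d_e, N_V = 4, 3                       # embedding dimension and vocabulary size
W_e = rng.normal(size=(d_e, N_V))     # stand-in for the learned embedding matrix

vocab = {"queen": 1, "king": 2}       # token -> token-id, as in the example above
s = [vocab["king"], vocab["queen"]]   # "Kings and Queens" -> [2, 1]

# e = W_e[:, v] for every token-id v; stack the embeddings column-wise.
S = np.stack([W_e[:, v] for v in s], axis=1)   # shape (d_e, len(s))
```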
In the simplest case, a learned linear projection is utilized to obtain the embedding. Linear embeddings of numerical features were previously explored in \textcites[][3]{kossenSelfAttentionDatapointsGoing2021}[][4]{somepalliSaintImprovedNeural2021}[][4]{gorishniyRevisitingDeepLearning2021}. +Transformer networks can handle numerical features, such as the trade price, by mapping the scalar value to a high-dimensional embedding vector and processing sequences thereof \autocite[\checkmark][24993]{gorishniyEmbeddingsNumericalFeatures2022}. In the simplest case, a learned linear projection is utilized to obtain the embedding. Linear embeddings of numerical features were previously explored in \textcites[][3]{kossenSelfAttentionDatapointsGoing2021}[][4]{somepalliSaintImprovedNeural2021}[][4]{gorishniyRevisitingDeepLearning2021}. In analogy to the word case, if the $m$-th feature, $\mathbf{x}[m]$, is numerical, it is projected to its embedding $\gls{e} \in \mathbb{R}^{d_e}$ by element-wise multiplication with a learned vector $\mathbf{W}_m \in \mathbb{R}^{d_{e}}$. Moreover, a feature-dependent bias term $\mathbf{b}_m \in \mathbb{R}^{d_{e}}$ is added, as noted in \cref{eq:numerical-embeddings}. \begin{equation} \gls{e}= \mathbf{W}_m \mathbf{x}[m] +\mathbf{b}_m \label{eq:numerical-embeddings} \end{equation} -More sophisticated approaches rely on parametric embeddings, like the \emph{piece-wise linear encoding} or the \emph{periodic encoding} of \textcite[][10]{gorishniyEmbeddingsNumericalFeatures2022}. Both enforce non-linearity. The authors show that these can alleviate the model's performance but at a non-neglectable computational cost. For this reason, our focus is on the computational more tractable linear embedding. +More sophisticated approaches rely on parametric embeddings, like the \emph{piece-wise linear encoding} or the \emph{periodic encoding} of \textcite[\checkmark][24993--24995]{gorishniyEmbeddingsNumericalFeatures2022}. Both enforce non-linearity. The authors show that these can improve the model's performance, but at a non-negligible computational cost. For this reason, our focus is on the computationally more tractable linear embedding. -More generally, the works of \textcites[][1]{gorishniyEmbeddingsNumericalFeatures2022}[][1]{somepalliSaintImprovedNeural2021} suggest, that numerical embedding can significantly improve robustness to missing values or noise. Their works miss a theoretical explanation. \textcite[][8--9]{grinsztajnWhyTreebasedModels2022} fill this void and attribute the increased robustness to the broken rotational invariance. +More generally, the works of \textcites[\checkmark][24997]{gorishniyEmbeddingsNumericalFeatures2022}[\checkmark][3--4]{somepalliSaintImprovedNeural2021} suggest that numerical embeddings can significantly improve robustness to missing values or noise. Their works lack a theoretical explanation. \textcite[][]{grinsztajnWhyTreebasedModels2022} fill this void and attribute the increased robustness to the broken rotational invariance. \textbf{Embeddings For Categorical Data} @@ -244,27 +242,26 @@ \subsubsection{Token Embedding}\label{sec:token-embeddings} \autocites[][4]{gorishniyRevisitingDeepLearning2021}[][2]{huangTabTransformerTabularData2020}[][4]{somepalliSaintImprovedNeural2021}. Analogously, each category is mapped to an embedding vector using a learned embedding matrix. Due to the heterogeneous nature of tabular data, embeddings may not be shared between features.
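A sketch of the linear embedding in \cref{eq:numerical-embeddings} and of the per-feature categorical lookup described next; the feature names, dimensions, and random parameters are illustrative stand-ins for learned weights.

```python
import numpy as np

rng = np.random.default_rng(0)
d_e = 8                                               # embedding dimension

# Numerical feature, e.g., the trade price: e = W_m * x[m] + b_m (element-wise).
W_price, b_price = rng.normal(size=d_e), rng.normal(size=d_e)
e_price = W_price * 101.5 + b_price

# Categorical feature, e.g., the issue type: per-feature lookup table plus bias.
issue_types = {"stock_option": 0, "index_option": 1}  # illustrative categories
W_issue = rng.normal(size=(d_e, len(issue_types)))
b_issue = rng.normal(size=d_e)
e_issue = W_issue[:, issue_types["index_option"]] + b_issue

# All embeddings of one instance are gathered column-wise in S (d_e x M).
S = np.stack([e_price, e_issue], axis=1)
```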
For categorical inputs, the embedding is implemented as a lookup table, analogous to \cref{eq:word-embeddings}. However, each feature has -its vocabulary $C_t$ with $N_{C_m}$ categories. Assume, the $m$-th feature is categorical. The specific embeddings $\gls{e}$ are queried with a unique integer key $c_{m} \in C_m \cong\left[N_{C_t}\right]$ from the learned embedding matrix $\mathbf{W}_m \in \mathbb{R}^{d_e \times N_{C_m}}$. Finally, a feature-specific bias term $\mathbf{b}_m \in \mathbb{R}^{d_{e}}$ is added as shown in \cref{eq:categorical-embeddings}. Like for the word case, all embeddings of an instance are gathered in $\mathbf{S}$. +its own vocabulary $C_m$ with $N_{C_m}$ categories. Assume the $m$-th feature is categorical. The specific embeddings $\gls{e}$ are queried with a unique integer key $c_{m} \in C_m \cong\left[N_{C_m}\right]$ from the learned embedding matrix $\mathbf{W}_m \in \mathbb{R}^{d_e \times N_{C_m}}$. Finally, a feature-specific bias term $\mathbf{b}_m \in \mathbb{R}^{d_{e}}$ is added as shown in \cref{eq:categorical-embeddings}. As for the word case, all embeddings of an instance are gathered in $\mathbf{S}$. \begin{equation} \gls{e}=\mathbf{W}_m[:,c_{m}] +\mathbf{b}_m \label{eq:categorical-embeddings} \end{equation} These categorical embeddings can potentially capture the intrinsic properties of categorical variables by arranging similar categories closer in the embedding space. For instance, consider the underlyings $\mathtt{GOOGL}$ (Alphabet Inc.), $\mathtt{MSFT}$ (Microsoft Inc.), and $\mathtt{K}$ (Kellogg Company). Due to the overlapping field of operations, one would anticipate greater similarity between Alphabet and Microsoft. -Despite these advantages, high-cardinal features present a challenge for embeddings since they are typically learned from a few samples, which promotes \gls{overfitting}. Handling high-dimensional categorical data remains an open research problem, as noted by \textcite[][2]{borisovDeepNeuralNetworks2022}. +Despite these advantages, high-cardinality features present a challenge for embeddings since they are typically learned from only a few samples, which promotes \gls{overfitting}. Handling high-dimensional categorical data remains an open research problem, as noted by \textcite[\checkmark][12]{borisovDeepNeuralNetworks2022}. \textbf{Link To Positional Encoding and Attention} -\todo{verify invariant property. Not sure if I got it right.} Embeddings can only encode the semantic relationship of tokens, but they do not provide a clue to the model about the relative or absolute order in which tokens appear in the sequence, since all stages of the encoder and decoder are invariant to the token's position. Positional information must be induced into the model to preserve the ordering (cp. \cref{sec:positional-encoding}). Another limitation of embeddings is that identical tokens share the same embedding, even if they are ambiguous and their meaning differs depending on the context in which they appear. To resolve this issue, embeddings are contextualized in the self-attention mechanism (cp. \cref{sec:attention}). \subsubsection{Positional Encoding}\label{sec:positional-encoding} -In practice, the order of words is important for the overall meaning of a sentence. As such, \textcite[][6]{vaswaniAttentionAllYou2017} propose to inject information on the \gls{token}'s position within the sequence through a \emph{positional encoding}, that is added onto the \gls{token} embedding.
+In practice, the order of words is important for the overall meaning of a sentence. As such, \textcite[\checkmark][6006]{vaswaniAttentionAllYou2017} propose to inject information on the \gls{token}'s position within the sequence through a \emph{positional encoding} that is added onto the \gls{token} embedding. Contrary to sentences, columns in tabular datasets are arranged in an arbitrary order, which weakens the need for positional information. However, unless the embeddings per feature are unique, a positional embedding is also required so that the model can relate the otherwise identical embeddings to specific features and distinguish them \autocites[][3]{huangTabTransformerTabularData2020}[][15]{somepalliSaintImprovedNeural2021}. -Like \gls{token} embeddings, positional embeddings can also be learned \autocite[cp.][4174]{devlinBERTPretrainingDeep2019}. Due to better, extrapolation capabilities, \textcite[][6]{vaswaniAttentionAllYou2017}, propose an positional encoding with the mapping $\gls{W-p}: \mathbb{N} \rightarrow \mathbb{R}^{d_{e}}$ based on sine and cosine signals to encode the \emph{absolute} position of the \gls{token}: +Like \gls{token} embeddings, positional embeddings can also be learned \autocite[cp.][4174]{devlinBERTPretrainingDeep2019}. Due to better extrapolation capabilities, \textcite[\checkmark][6006]{vaswaniAttentionAllYou2017} propose a positional encoding with the mapping $\gls{W-p}: \mathbb{N} \rightarrow \mathbb{R}^{d_{e}}$ based on sine and cosine signals to encode the \emph{absolute} position of the \gls{token}: \begin{equation} \begin{aligned} \gls{W-p}\left[2 i-1, t\right] & =\sin \left(t / \gls{ellmax}^{2 i / \gls{d}_e}\right), \\ @@ -283,9 +280,9 @@ \subsubsection{Positional Encoding}\label{sec:positional-encoding} The positional encoding is visualized in \cref{fig:positional-embedding}. One can see the alternating pattern between even and odd columns and the unique pattern for each \gls{token}'s position. -Using trigonometric functions for the positional embedding is favorable, due to being zero-centered and resulting in values in the closed range of $[-1,1]$. These properties are long known to promote convergence of neural networks \autocites[][8-9]{lecunEfficientBackProp2012}[][2]{ioffeBatchNormalizationAccelerating2015}. +Using trigonometric functions for the positional embedding is favorable, due to being zero-centered and resulting in values in the closed range of $[-1,1]$. These properties are long known to promote convergence of neural networks \autocites[][16--17]{lecunEfficientBackProp2012}[][2]{ioffeBatchNormalizationAccelerating2015}. -The reason for encoding with both the sine and cosine is more subtle, as either one would suffice for absolute embeddings. \textcite[][6]{vaswaniAttentionAllYou2017} hypothesize, that besides learning the \emph{absolute} position i.e., fifth place in sequence, providing both sine and cosine also enables the model to attend to \emph{relative} positions, i.e., two places from a given \gls{token}. +The reason for encoding with both the sine and cosine is more subtle, as either one would suffice for absolute embeddings. \textcite[\checkmark][6006]{vaswaniAttentionAllYou2017} hypothesize that, besides learning the \emph{absolute} position, i.e., the fifth place in the sequence, providing both sine and cosine also enables the model to attend to \emph{relative} positions, i.e., two places from a given \gls{token}.
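A compact sketch of such a sinusoidal encoding is given below; it follows the common convention of placing the sine on even and the cosine on odd embedding dimensions and assumes the usual base of $10^4$ for $\gls{ellmax}$, so the indexing differs slightly from the notation above.

\begin{verbatim}
import numpy as np


def positional_encoding(seq_len: int, d_e: int, base: float = 10_000.0) -> np.ndarray:
    """Sinusoidal absolute positional encoding, one row per position t."""
    t = np.arange(seq_len)[:, None]            # positions 0 ... seq_len - 1
    i = np.arange(0, d_e, 2)[None, :]          # even embedding dimensions
    angles = t / base ** (i / d_e)             # t / base^(2i / d_e)
    enc = np.zeros((seq_len, d_e))
    enc[:, 0::2] = np.sin(angles)              # sine on even dimensions
    enc[:, 1::2] = np.cos(angles)              # cosine on odd dimensions
    return enc


pe = positional_encoding(seq_len=4, d_e=8)
print(pe.shape)  # (4, 8); each row is added to the token embedding at position t
\end{verbatim}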
The positional embedding is finally added per element to the token embedding to form a \gls{token}'s initial embedding $\gls{e}$. For the $\gls{t}$-th \gls{token} of a sequence $\mathbf{s}$, the embedding becomes: \begin{equation} @@ -300,9 +297,9 @@ \subsubsection{Attention Mechanism}\label{sec:attention} \textbf{Preliminaries} -Attention can be thought of as a mapping between a query and a set of key-value pairs to an output. In general, the current token is first projected onto a query vector, and all tokens in the context are mapped to key and value vectors. Similar to a soft dictionary lookup, the goal is to retrieve the values from tokens in the context for which the keys are similar to the query and return an aggregate estimate of the values weighted by the similarity of the keys and the query. Naturally, if a token in the context is important for predicting the queried token, indicated by a high similarity, the value of the context token has a large contribution to the output \autocites[][5]{phuongFormalAlgorithmsTransformers2022}[][3]{vaswaniAttentionAllYou2017}. +Attention can be thought of as a mapping from a query and a set of key-value pairs to an output. In general, the current token is first projected onto a query vector, and all tokens in the context are mapped to key and value vectors. Similar to a soft dictionary lookup, the goal is to retrieve the values from tokens in the context for which the keys are similar to the query and return an aggregate estimate of the values weighted by the similarity of the keys and the query. Naturally, if a token in the context is important for predicting the queried token, indicated by a high similarity, the value of the context token has a large contribution to the output \autocites[\checkmark][5]{phuongFormalAlgorithmsTransformers2022}[\checkmark][6003]{vaswaniAttentionAllYou2017}. -Attention first appeared in \textcite[][4]{bahdanauNeuralMachineTranslation2016} and was popularized by \textcite[][4]{vaswaniAttentionAllYou2017}. The latter introduced a specific attention mechanism, known as \emph{scaled dot-product attention}, which we introduce in detail. +Attention first appeared in \textcite[\checkmark][4]{bahdanauNeuralMachineTranslation2016} and was popularized by \textcite[\checkmark][6004]{vaswaniAttentionAllYou2017}. The latter introduced a specific attention mechanism, known as \emph{scaled dot-product attention}, which we present in detail. \textbf{Scaled Dot-Product Attention} @@ -317,14 +314,13 @@ \subsubsection{Attention Mechanism}\label{sec:attention} \end{aligned} \label{eq:attention} \end{equation} -where $\mathbf{S} \in \mathbb{R}^{d_s \times \ell_s}$ and $\mathbf{Z} \in \mathbb{R}^{d_z \times \ell_z}$ are vector representations of the primary input sequence and of the context sequence. Both the primary and the context sequences are identical for the encoder but are different for the decoder. The query, key, and value matrices $\mathbf{Q}=\mathbf{W}_q \mathbf{S} + \mathbf{b}_q\mathbf{1}^{\top}$, $\mathbf{K}=\mathbf{W}_k \mathbf{Z} + \mathbf{b}_k\mathbf{1}^{\top}$, and $\mathbf{V}=\mathbf{W}_v \mathbf{Z} + \mathbf{b}_v\mathbf{1}^{\top}$ are linear projections of the input and context sequences, and $\mathbf{W}_q, \mathbf{W}_k \in \mathbb{R}^{d_{\mathrm{attn}\times d_{s}}}$; $\mathbf{W}_v \in \mathbb{R}^{d_{\mathrm{out}\times d_{z}}}$; $\mathbf{b}_q, \mathbf{b}_k \in \mathbb{R}^{d_{\mathrm{attn}}}$, and $\mathbf{b}_v \in \mathbb{R}^{d_{\mathrm{out}}}$ are learnable parameters.
The dimensionality of the attention mechanism, $d_{\mathrm{attn}}$, is typically a fraction of the model dimensionality to accelerate computation. Likewise, the output dimension, $d_{out}$, is another hyperparameter to the models. The attention scores are $\mathbf{A}$, which are scaled by $\sqrt{d_{\mathrm{attn}}}$ to avoid unstable gradients, and the softmax activation normalizes all scores. As normalized attention scores have a clear interpretation as the weights of how much a token contributes to the model's output, the attention mechanism provides a window into the model, which we explore in \cref{sec:feature-importance-measure}. +where $\mathbf{S} \in \mathbb{R}^{d_s \times \ell_s}$ and $\mathbf{Z} \in \mathbb{R}^{d_z \times \ell_z}$ are vector representations of the primary input sequence and the context sequence. Both the primary and the context sequences are identical for the encoder but are different for the decoder. The query, key, and value matrices $\mathbf{Q}=\mathbf{W}_q \mathbf{S} + \mathbf{b}_q\mathbf{1}^{\top}$, $\mathbf{K}=\mathbf{W}_k \mathbf{Z} + \mathbf{b}_k\mathbf{1}^{\top}$, and $\mathbf{V}=\mathbf{W}_v \mathbf{Z} + \mathbf{b}_v\mathbf{1}^{\top}$ are linear projections of the input and context sequences, and $\mathbf{W}_q, \mathbf{W}_k \in \mathbb{R}^{d_{\mathrm{attn}} \times d_{s}}$; $\mathbf{W}_v \in \mathbb{R}^{d_{\mathrm{out}} \times d_{z}}$; $\mathbf{b}_q, \mathbf{b}_k \in \mathbb{R}^{d_{\mathrm{attn}}}$, and $\mathbf{b}_v \in \mathbb{R}^{d_{\mathrm{out}}}$ are learnable parameters. The dimensionality of the attention mechanism, $d_{\mathrm{attn}}$, is typically a fraction of the model dimensionality to accelerate computation. Likewise, the output dimension, $d_{\mathrm{out}}$, is another hyperparameter of the model. The attention scores are $\mathbf{A}$, which are scaled by $\sqrt{d_{\mathrm{attn}}}$ to avoid unstable gradients, and the softmax activation normalizes all scores. As normalized attention scores have a clear interpretation as the weights of how much a token contributes to the model's output, the attention mechanism provides a window into the model, which we explore in \cref{sec:feature-importance-measure}. \textbf{Multi-Head Attention} -Rather than relying on a single attention function, \textcite[][4--5]{vaswaniAttentionAllYou2017} introduce multiple \emph{attention heads}, which perform attention in parallel on $H$ \emph{different} linear projections of queries, keys, and values. The \emph{multi-head attention} enables the model to learn richer representations of the input, as attention heads operate independently, they can pick up unique patterns or focus on different positions in the sequence at once. Multi-head attention is visualized in \cref{fig:transformer-architecture-overview} (center). +Rather than relying on a single attention function, \textcite[\checkmark][6004--6005]{vaswaniAttentionAllYou2017} introduce multiple \emph{attention heads}, which perform attention in parallel on $H$ \emph{different} linear projections of queries, keys, and values. The \emph{multi-head attention} enables the model to learn richer representations of the input: as attention heads operate independently, they can pick up unique patterns or focus on different positions in the sequence at once. Multi-head attention is visualized in \cref{fig:transformer-architecture-overview} (center).
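To illustrate \cref{eq:attention}, the following NumPy sketch computes single-head self-attention, i.e., the case $\mathbf{S}=\mathbf{Z}$, omitting the bias terms and masking; all dimensions are hypothetical. Multi-head attention repeats this computation $H$ times with different projection matrices and concatenates the outputs.

\begin{verbatim}
import numpy as np


def scaled_dot_product_attention(S, W_q, W_k, W_v):
    """Single-head self-attention on a sequence S of shape (d_s, ell_s)."""
    Q, K, V = W_q @ S, W_k @ S, W_v @ S                # queries, keys, values
    d_attn = Q.shape[0]
    scores = Q.T @ K / np.sqrt(d_attn)                 # scaled query-key similarities
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)     # softmax over the context tokens
    return V @ weights.T                               # weighted aggregate of the values


rng = np.random.default_rng(0)
d_s, ell_s, d_attn, d_out = 16, 5, 8, 16
S = rng.normal(size=(d_s, ell_s))
out = scaled_dot_product_attention(
    S,
    W_q=rng.normal(size=(d_attn, d_s)),
    W_k=rng.normal(size=(d_attn, d_s)),
    W_v=rng.normal(size=(d_out, d_s)),
)
print(out.shape)  # (16, 5), one output column per token
\end{verbatim}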
-\todo{introduce word modalities} -Exemplary for machine translation, \textcite[][5795]{voitaAnalyzingMultiHeadSelfAttention2019} show, that heads serve indeed distinct purposes like learning positional or syntactic relations between tokens. It is conceivable, that for tabular data this maps to dependencies between features. In practice, Transformers may not leverage all attention heads and some heads could even be pruned without impacting the performance \autocites[][9]{michelAreSixteenHeads2019}[][5805]{voitaAnalyzingMultiHeadSelfAttention2019}. +For machine translation, \textcite[\checkmark][5797]{voitaAnalyzingMultiHeadSelfAttention2019} show that heads indeed serve distinct purposes, like learning positional or syntactic relations between tokens. It is conceivable that for tabular data this maps to dependencies between features. In practice, Transformers may not leverage all attention heads and some heads could even be pruned without impacting performance \autocites[][9]{michelAreSixteenHeads2019}[\checkmark][5805]{voitaAnalyzingMultiHeadSelfAttention2019}. Multi-head attention can be computed as: @@ -342,11 +338,11 @@ \subsubsection{Attention Mechanism}\label{sec:attention} In \cref{eq:attention}, tokens can attend to any preceding or subsequent token without restrictions. Thus, the full \emph{bidirectional context} is used. This design is optimal for the encoder, where the entire input sequence shall serve as the context. -For the decoder, the self-attention is modified to \emph{masked self-attention} and \emph{cross-attention} mechanism. First, causal masking is required to achieve autoregressive sequence generation in the decoder. The context is \emph{unidirectional}, where a token is only allowed to attend to itself or all previously generated tokens. Second, the decoder uses \emph{cross-attention} to connect between the encoder and decoder. Other than in the self-attention mechanism, where keys, values and queries are generated from the same sequence, keys and values come from the encoder and queries are provided by the decoder. As our focus is on encoder-only architectures, we refer the reader to \textcite[][16--17]{raffelExploringLimitsTransfer2020} for an in-depth treatment of both topics. +For the decoder, the self-attention is modified into a \emph{masked self-attention} and a \emph{cross-attention} mechanism. First, causal masking is required to achieve autoregressive sequence generation in the decoder. The context is \emph{unidirectional}, where a token is only allowed to attend to itself or all previously generated tokens. Second, the decoder uses \emph{cross-attention} to connect the encoder and decoder. Other than in the self-attention mechanism, where keys, values, and queries are generated from the same sequence, keys and values come from the encoder, and queries are provided by the decoder. As our focus is on encoder-only architectures, we refer the reader to \textcite[\checkmark][16--17]{raffelExploringLimitsTransfer2020} for an in-depth treatment of both topics. \subsubsection{Position-Wise Feed-Forward Networks}\label{sec:position-wise-ffn} -The attention mechanism enables \glspl{token} to attend to other inputs in the immediate context. To retain general information on the task, outside and independent of the immediate context, each Transformer block adds a point-wise \gls{feed-forward-network}, which acts as a persistent memory to the model \autocite[][3]{sukhbaatarAugmentingSelfattentionPersistent2019}.
+The attention mechanism enables \glspl{token} to attend to other inputs in the immediate context. To retain general information on the task, outside and independent of the immediate context, each Transformer block adds a point-wise \gls{feed-forward-network}, which acts as a persistent memory to the model \autocite[\checkmark][3]{sukhbaatarAugmentingSelfattentionPersistent2019}. The network consists of a linear transformation, followed by a non-linear activation function and a second linear layer. For the $l$-th layer, the \gls{MLP} is given by \begin{equation} @@ -354,46 +350,44 @@ \subsubsection{Position-Wise Feed-Forward Networks}\label{sec:position-wise-ffn} \end{equation} with $\mathbf{W}_{\mathrm{mlp} 1}^l \in \mathbb{R}^{d_{\mathrm{mlp}} \times d_{e}}, \mathbf{b}_{\mathrm{mlp} 1}^l \in \mathbb{R}^{d_{\mathrm{mlp}}}, \mathbf{W}_{\mathrm{mlp} 2}^l \in \mathbb{R}^{d_{e} \times d_{\mathrm{mlp}}}$ and $\mathbf{b}_{\mathrm{mlp} 2}^l \in \mathbb{R}^{d_{e}}$ being learnable parameters identical for all \glspl{embedding} in the layer. The network is applied to each embedding separately and identically. -\textcite[][9]{vaswaniAttentionAllYou2017} set the hidden dimension to be two to eight magnitudes of the embedding dimension. The large capacity strengthens the model's ability to retain information but also contributes significantly to the high computational requirements and memory footprint of Transformers \autocites[][5]{tayEfficientTransformersSurvey2022}[][1]{kitaevReformerEfficientTransformer2020}. Both linear transformations are separated by a \gls{ReLU} \gls{activation-function} \autocite[][318]{glorotDeepSparseRectifier2011} to introduce non-linearities to the network. +\textcite[\checkmark][6009]{vaswaniAttentionAllYou2017} set the hidden dimension to two to eight times the embedding dimension. The large capacity strengthens the model's ability to retain information but also contributes significantly to the high computational requirements and memory footprint of Transformers \autocites[\checkmark][4--5]{tayEfficientTransformersSurvey2022}[\checkmark][1]{kitaevReformerEfficientTransformer2020}. Both linear transformations are separated by a \gls{ReLU} \gls{activation-function} \autocite[\checkmark][318]{glorotDeepSparseRectifier2011} to introduce non-linearities to the network. Like the attention layer, the position-wise \gls{FFN} is surrounded by residual connections, followed by layer normalization (cp. \cref{sec:residual-connections-layer-norm}). Both are vital for the training process and convergence of the overall network. Optionally, dropout is added to prevent the model from \gls{overfitting}. \subsubsection{Residual Connections and Layer Normalization}\label{sec:residual-connections-layer-norm} -Recall from earlier chapters, that the encoder stacks multiple Transformer blocks, each of which consists of several sub-layers, resulting in a deep network. While depth is inevitable to learn hierarchical representations, the training of such a network is complicated. As neural networks are commonly trained using backpropagation, which relies on the gradient of the error to be propagated through the network starting at the last layer, vanishing or \glspl{exploding-gradient} pose a major difficulty in training deep neural nets \autocite[][1]{heDeepResidualLearning2015}.
Without countermeasures, stacking multiple layers in the encoder and decoder of the Transformers impedes the gradient information to flow efficiently through the network and hampers the training behavior \autocite[][1811]{wangLearningDeepTransformer2019}. +Recall from earlier chapters that the encoder stacks multiple Transformer blocks, each of which consists of several sub-layers, resulting in a deep network. While depth is inevitable to learn hierarchical representations, the training of such a network is complicated. As neural networks are commonly trained using backpropagation, which relies on the gradient of the error to be propagated through the network starting at the last layer, vanishing or \glspl{exploding-gradient} pose a major difficulty in training deep neural nets \autocite[\checkmark][1]{heDeepResidualLearning2015}. Without countermeasures, stacking multiple layers in the encoder and decoder of the Transformers impedes the efficient flow of gradient information through the network and hampers the training behavior \autocite[\checkmark][1811]{wangLearningDeepTransformer2019}. -As a remedy, \textcite[][3]{vaswaniAttentionAllYou2017} employ residual connections around each sub-layer, whereby the output of the sub-layer is added element-wisely to its input. Intuitively, the residual connection provides an alternative path for information to flow through the network, since some information can bypass the sub-layer and thereby reach deeper layers within the stack. Vanishing or \glspl{exploding-gradient} are also mitigated, as gradients can bypass the sub-layer, eventually contributing towards an easier optimization \autocite[][3591]{liuRethinkingSkipConnection2020}. Residual connections moreover help to preserve the positional embeddings (cp. \cref{sec:positional-encoding}), as the layer's inputs are maintained in the identity mapping. Another technique to improve the training behavior is layer normalization. +As a remedy, \textcite[\checkmark][6003]{vaswaniAttentionAllYou2017} employ residual connections around each sub-layer, whereby the output of the sub-layer is added element-wise to its input. Intuitively, the residual connection provides an alternative path for information to flow through the network, since some information can bypass the sub-layer and thereby reach deeper layers within the stack. Vanishing or \glspl{exploding-gradient} are also mitigated, as gradients can bypass the sub-layer, eventually contributing towards an easier optimization \autocite[\checkmark][3591]{liuRethinkingSkipConnection2020}. Residual connections moreover help to preserve the positional embeddings (cp. \cref{sec:positional-encoding}), as the layer's inputs are maintained in the identity mapping. Another technique to improve the training behavior is layer normalization. -\textcite[][3]{vaswaniAttentionAllYou2017} extensively draw on layer normalization \autocite[][4]{baLayerNormalization2016} after the multi-head attention and feed-forward sub-layers. It is used for normalizing the activations of the sub-layer and to stabilize and accelerate the training of the network \autocite[][2]{baLayerNormalization2016}. The normalization statistics are calculated separately for every instance, which guarantees scalability across different batch sizes. +\textcite[\checkmark][6003]{vaswaniAttentionAllYou2017} extensively draw on layer normalization \autocite[\checkmark][4]{baLayerNormalization2016} after the multi-head attention and feed-forward sub-layers.
It normalizes the activations of the sub-layer and stabilizes and accelerates the training of the network \autocite[\checkmark][2]{baLayerNormalization2016}. The normalization statistics are calculated separately for every instance, which guarantees scalability across different batch sizes. -Until now it remains unclear, how the layer normalization intertwines with the sub-layers and the residual connections. Transformers are distinguished by the order in which layer normalization is added into the pre-norm and post-norm Transformer. Post-norm Transformers add layer normalization to the sub-layer \emph{after} adding the input from the residual connections. The arrangement is depicted in \cref{fig:transformer-architecture-overview}. In contrast for pre-norm Transformers, the normalization is applied \emph{before} the self-attention and feed-forward sub-layers and inside the residual connections. Pre-norm requires one additional normalization layer to pass only well-conditioned outputs from the Transformer block to the successive layers \autocite[][5]{xiongLayerNormalizationTransformer2020}. +Until now, it remains unclear how the layer normalization intertwines with the sub-layers and the residual connections. Depending on where the layer normalization is inserted, Transformers are distinguished into the pre-norm and the post-norm Transformer. Post-norm Transformers add layer normalization to the sub-layer \emph{after} adding the input from the residual connections. The arrangement is depicted in \cref{fig:transformer-architecture-overview}. In contrast, for pre-norm Transformers, the normalization is applied \emph{before} the self-attention and feed-forward sub-layers and inside the residual connections. Pre-norm requires one additional normalization layer to pass only well-conditioned outputs from the Transformer block to the successive layers \autocite[\checkmark][10528]{xiongLayerNormalizationTransformer2020}. -\textcite[][3]{vaswaniAttentionAllYou2017} employ post-layer normalization, but recent research has shown a shift towards pre-norm setups \autocite[][4]{narangTransformerModificationsTransfer2021}. Parts of the widespread adaption lie in faster training and omitting of the need for costly learning rate warm-up stages, whereby the learning rate is initially decreased to keep the gradients balanced \autocites[][2]{xiongLayerNormalizationTransformer2020}[][8]{liuUnderstandingDifficultyTraining2020}. In addition, post-norm Transformers have been found brittle to train and prone to convergence failures with its root cause in vanishing gradients, \glspl{exploding-gradient}, and an overall higher dependency on the residual stream \autocites[][8]{liuUnderstandingDifficultyTraining2020}[][1812]{wangLearningDeepTransformer2019}. Pre-norm Transformers, although they may sacrifice some performance, introduce a certain robustness to the training process. We come back to this property in our discussion on the FT-Transformer. +\textcite[\checkmark][6003]{vaswaniAttentionAllYou2017} employ post-layer normalization, but recent research has shown a shift towards pre-norm setups \autocite[][4]{narangTransformerModificationsTransfer2021}. Part of the widespread adoption lies in faster training and the omission of costly learning rate warm-up stages, whereby the learning rate is initially kept small to keep the gradients balanced \autocites[][2]{xiongLayerNormalizationTransformer2020}[][8]{liuUnderstandingDifficultyTraining2020}.
In addition, post-norm Transformers have been found brittle to train and prone to convergence failures with their root cause in vanishing gradients, \glspl{exploding-gradient}, and an overall higher dependency on the residual stream \autocites[][8]{liuUnderstandingDifficultyTraining2020}[][1812]{wangLearningDeepTransformer2019}. Pre-norm Transformers, although they may sacrifice some performance, introduce a certain robustness to the training process. We come back to this property in our discussion on the FT-Transformer. \subsubsection{FT-Transformer}\label{sec:fttransformer} -\todo{try to introduce BERT here somewhere.} - -Many of the previous concepts can be adapted to the tabular domain with minor architectural changes. \textcite[][5]{gorishniyRevisitingDeepLearning2021} propose with FT-Transformer an adaption, that pairs an embedding unit for both numerical and categorical inputs, dubbed the feature tokenizer, with a Transformer. The complete architecture is depicted in \cref{fig:fttransformer}. Notably, the Transformer units use a pre-norm setup for easier optimization, whereby the very first normalization layer in the encoder is removed due to a propitious performance \textcite[][17]{gorishniyRevisitingDeepLearning2021}. The upstream feature tokenizer transforms every feature in $\mathbf{x}$ to their embeddings. The embeddings are given by \cref{eq:numerical-embeddings,eq:categorical-embeddings}. +Many of the previous concepts can be adapted to the tabular domain with minor architectural changes. \textcite[\checkmark][18935]{gorishniyRevisitingDeepLearning2021} propose with the FT-Transformer an adaptation that pairs an embedding unit for both numerical and categorical inputs, dubbed the feature tokenizer, with a Transformer. The complete architecture is depicted in \cref{fig:fttransformer}. The encoder-only architecture is inspired by \gls{BERT} of \textcite[\checkmark][4173--4174]{devlinBERTPretrainingDeep2019}. Transformer units use a pre-norm setup for easier optimization, whereby the very first normalization layer in the encoder is removed as it was found to improve performance \autocite[\checkmark][18948]{gorishniyRevisitingDeepLearning2021}. The upstream feature tokenizer transforms every feature in $\mathbf{x}$ to its embedding. The embeddings are given by \cref{eq:numerical-embeddings,eq:categorical-embeddings}. \begin{figure}[ht] \centering {\renewcommand\normalsize{\scriptsize} \normalsize \input{./Graphs/fttransformer.pdf_tex}} - \caption[Overview Over the FT-Transformer Architecture]{Overview Over the Architecture of FT-Transformer. The FT-Transformer uses a pre-norm arrangement and operates on numerical and categorical embeddings. Own work inspired by \textcite[][4--5]{gorishniyRevisitingDeepLearning2021}.} + \caption[Overview Over the FT-Transformer Architecture]{Overview Over the Architecture of FT-Transformer. The FT-Transformer uses a pre-norm arrangement and operates on numerical and categorical embeddings. Visualization inspired by \textcite[\checkmark][18935--18936]{gorishniyRevisitingDeepLearning2021}.} \label{fig:fttransformer} \end{figure} -Recall from our discussion on self-attention (cp. \cref{sec:attention}), that each \gls{token} encodes the \glspl{token} within the sequence. Based on this notion, \textcite[][4174]{devlinBERTPretrainingDeep2019} prepend a specialized $\mathtt{[CLS]}$ \gls{token} to the sequence, which stores the sequence's aggregate representation.
Like any other \gls{token}, the $\mathtt{[CLS]}$ \gls{token} is embedded first and contextualized in the encoder. Its final hidden state is then used for classification. +Recall from our discussion on self-attention (cp. \cref{sec:attention}) that each \gls{token} encodes information about the other \glspl{token} within the sequence. Based on this notion, \textcite[\checkmark][4174]{devlinBERTPretrainingDeep2019} prepend a specialized $\mathtt{[CLS]}$ \gls{token} to the sequence, which stores the sequence's aggregate representation. Like any other \gls{token}, the $\mathtt{[CLS]}$ \gls{token} is embedded first and contextualized in the encoder. Its final hidden state is then used for classification. -\textcite[][4]{gorishniyRevisitingDeepLearning2021} adapt the idea of a $\mathtt{[CLS]}$ \gls{token} for tabular representation models. Similar to the embeddings of categorical or numerical features, the embedding of the $[\mathtt{CLS}]$ \gls{token} $\gls{e}_\mathtt{[CLS]} \in \mathbb{R}^{d_{e}}$ is prepended to the column embeddings with $\mathbf{S} = \left[\gls{e}_\mathtt{[CLS]}, \gls{e}_1, \ldots \gls{e}_{M}\right]$, where $\mathbf{S} \in \mathbb{R}^{d_{e} \times M +1}$. Like before, $\mathbf{S}$ is passed through a stack of Transformer layers. The updated representation of the $\mathtt{[CLS]}$ \gls{token} is used exclusively for prediction: +\textcite[\checkmark][18935]{gorishniyRevisitingDeepLearning2021} adapt the idea of a $\mathtt{[CLS]}$ \gls{token} for tabular representation models. Similar to the embeddings of categorical or numerical features, the embedding of the $[\mathtt{CLS}]$ \gls{token} $\gls{e}_\mathtt{[CLS]} \in \mathbb{R}^{d_{e}}$ is prepended to the column embeddings with $\mathbf{S} = \left[\gls{e}_\mathtt{[CLS]}, \gls{e}_1, \ldots, \gls{e}_{M}\right]$, where $\mathbf{S} \in \mathbb{R}^{d_{e} \times (M+1)}$. Like before, $\mathbf{S}$ is passed through a stack of Transformer layers. The updated representation of the $\mathtt{[CLS]}$ \gls{token} is used exclusively for prediction and passed to a classification head: \begin{equation} P=\operatorname{Linear}\left(\operatorname{ReLU}\left(\operatorname{LayerNorm}\left(\mathbf{S}\left[:,0\right]\right)\right)\right). \label{eq:bert-ft} \end{equation} \todo{Add softmax, think about ReLU, change the linear layer to Weight matrix?} -\textcite[][8]{gorishniyRevisitingDeepLearning2021} achieve state-of-the-art performance through numerical and categorical embeddings. Embedding both categorical and numerical inputs enables the Transformer to attend to all other features, but at a considerable computational cost, that may only be justified by higher classification accuracies. +\textcite[\checkmark][18939]{gorishniyRevisitingDeepLearning2021} achieve state-of-the-art performance through numerical and categorical embeddings. Embedding both categorical and numerical inputs enables the Transformer to attend to all other features, but at a considerable computational cost that may only be justified by higher classification accuracies. -Next, all models are extended for learning on partially-labeled data. \ No newline at end of file +Next, all previous models are extended for learning on partially-labeled data.
\ No newline at end of file diff --git a/reports/Content/training-tuning.tex b/reports/Content/training-tuning.tex index 14992353..b4e0af42 100644 --- a/reports/Content/training-tuning.tex +++ b/reports/Content/training-tuning.tex @@ -5,12 +5,12 @@ \subsection{Training and Tuning}\label{sec:training-and-tuning} \subsubsection{Training of Supervised Models}\label{sec:training-of-supervised-models} -Our implementation of \glspl{GBRT} is based on CatBoost \autocite[][5--6]{prokhorenkovaCatBoostUnbiasedBoosting2018} because of its efficient implementation on \glspl{GPU} and native support for categorical variables. However, as discussed in \cref{sec:gradient-boosting-procedure}, we expect the chosen library to have minimal impact on performance. +Our implementation of \glspl{GBRT} is based on CatBoost \autocite[\checkmark][5--6]{prokhorenkovaCatBoostUnbiasedBoosting2018} because of its efficient implementation on \glspl{GPU} and native support for categorical variables. However, as discussed in \cref{sec:gradient-boosting-procedure}, we expect the chosen library to have minimal impact on performance. \begin{figure}[ht] \centering \includegraphics{gbm-train-val-loss-acc.pdf} - \caption[Training and Validation Accuracy of Gradient-Boosting]{Training and validation accuracy of \gls{GBRT} on \gls{ISE} sample. Metrics are estimated on the classical feature set. One iteration corresponds to an additional regression tree added to the ensemble. Loss is expected to decrease for more complex ensembles and accuracy to increase.} + \caption[Training and Validation Accuracy of Gradient-Boosting]{Training and validation accuracy of \gls{GBRT} on \gls{ISE} sample. Metrics are estimated on the feature set classic. One iteration corresponds to an additional regression tree added to the ensemble. Loss is expected to decrease for more complex ensembles and accuracy to increase.} \label{fig:gbm-train-val-loss-acc} \end{figure} @@ -36,19 +36,19 @@ \subsubsection{Training of Supervised \label{fig:gbm-optimizations-loss-acc} \end{figure} -We leverage several architectural changes to reduce the loss, further improve performance and mitigate overfitting in gradient boosting, as shown in \cref{fig:gbm-optimizations-loss-acc}, where the effects on validation accuracy and log loss over the default configuration are visualized. Following standard practice, e.g., \textcite[][]{tuningplaybookgithub}, all other parameters are kept at their default values, while a single parameter is varied to derive the plots. Although this approach ignores parameter interactions, it still can guide the optimal training configuration. We train on the \gls{ISE} training set with classical features and report metrics on the validation set. +We leverage several architectural changes to reduce the loss, further improve performance and mitigate overfitting in gradient boosting, as shown in \cref{fig:gbm-optimizations-loss-acc}, where the effects on validation accuracy and log loss over the default configuration are visualized. Following standard practice, e.g., \textcite[\checkmark][]{tuningplaybookgithub}, all other parameters are kept at their default values, while a single parameter is varied to derive the plots. Although this approach ignores parameter interactions, it still can guide the optimal training configuration. We train on the \gls{ISE} training set with classical features and report metrics on the validation set. 
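The protocol of varying one parameter at a time could be sketched as follows; the data, feature names, and parameter values are synthetic placeholders, the parameter names follow the CatBoost API, and on a \gls{GPU} one would additionally set the \texttt{task\_type} parameter accordingly.

\begin{verbatim}
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

rng = np.random.default_rng(0)
n = 1_000
X = pd.DataFrame({
    "price": rng.normal(size=n),
    "size": rng.integers(1, 100, n),
    "issue_type": rng.choice(["stock option", "index option"], n),  # categorical
})
y = rng.integers(0, 2, n)
X_train, X_val, y_train, y_val = X[:800], X[800:], y[:800], y[800:]

# vary a single parameter per run, keep everything else at the defaults
overrides = {
    "default": {},
    "leaf-wise growth": {"grow_policy": "Lossguide"},
    "border count": {"border_count": 254},
}

for name, params in overrides.items():
    model = CatBoostClassifier(iterations=100, cat_features=["issue_type"],
                               verbose=False, **params)
    model.fit(X_train, y_train, eval_set=(X_val, y_val))
    print(name, model.score(X_val, y_val))  # validation accuracy
\end{verbatim}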
\emph{Growth Strategy} -To improve performance, we switch to a leaf-wise growth strategy, following \textcite[][4]{chenXGBoostScalableTree2016}. By default, CatBoost grows oblivious regression trees, which are symmetric and grown level-wise. In this strategy, splits are performed on the same feature and split values across all nodes of a single level, which is computationally efficient but may compromise performance. In contrast, leaf-wise growth selects terminal nodes that provide the largest improvement in the loss, potentially leading to nodes within the same level being split with different features and values, resulting in a closer fit to the data. Leaf-wise growth also aligns with the intuition of split finding from \cref{sec:decision-tree}. This change improves validation accuracy by \SI{0.3461}{\percent} but has little effect on the loss. +To improve performance, we switch to a leaf-wise growth strategy, following \textcite[\checkmark][786]{chenXGBoostScalableTree2016}. By default, CatBoost grows oblivious regression trees, which are symmetric and grown level-wise. In this strategy, splits are performed on the same feature and split values across all nodes of a single level, which is computationally efficient but may compromise performance. In contrast, leaf-wise growth selects terminal nodes that provide the largest improvement in the loss, potentially leading to nodes within the same level being split with different features and values, resulting in a closer fit to the data. Leaf-wise growth also aligns with the intuition of split finding from \cref{sec:decision-tree}. This change improves validation accuracy by \SI{0.3461}{\percent} but has little effect on the loss. \emph{Sample Weighting} -The work of \textcite[][36--38]{grauerOptionTradeClassification2022} suggests a strong temporal shift in the data, with the performance of classical trade classification rules deteriorating over time. As a result, the predictability of features derived from these rules diminishes over time, and patterns learned from old observations become less relevant for predicting test samples. To address this, we introduce a sample weighting scheme that assigns higher weights to recent training samples and gradually decays weights over time, which we incorporate into the log loss. Validation and test samples are equally weighted. Sample weighting proves to be essential for achieving high validation performance, and it positively impacts the accuracy and confidence in the prediction mitigating the problem from above. +The work of \textcite[\checkmark][35]{grauerOptionTradeClassification2022} suggests a strong temporal shift in the data, with the performance of classical trade classification rules deteriorating over time. As a result, the predictability of features derived from these rules diminishes over time, and patterns learned from old observations become less relevant for predicting test samples. To address this, we introduce a sample weighting scheme that assigns higher weights to recent training samples and gradually decays weights over time, which we incorporate into the log loss. Validation and test samples are equally weighted. Sample weighting proves to be essential for achieving high validation performance, and it positively impacts the accuracy and the confidence of the predictions, mitigating the problem described above.
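One possible realization of such a decaying weighting scheme is sketched below; the exponential decay profile and the synthetic data are assumptions for illustration, and the weights enter the weighted log loss via the \texttt{weight} argument of a CatBoost \texttt{Pool}.

\begin{verbatim}
import numpy as np
from catboost import CatBoostClassifier, Pool

rng = np.random.default_rng(0)
X_train, y_train = rng.normal(size=(800, 5)), rng.integers(0, 2, 800)  # sorted by trade time
X_val, y_val = rng.normal(size=(200, 5)), rng.integers(0, 2, 200)


def time_decayed_weights(n: int, half_life: float) -> np.ndarray:
    """Exponentially decaying weights; the most recent sample receives weight 1."""
    age = np.arange(n)[::-1]              # the oldest sample has the largest age
    return 0.5 ** (age / half_life)


weights = time_decayed_weights(len(y_train), half_life=len(y_train) / 4)
train_pool = Pool(X_train, y_train, weight=weights)
val_pool = Pool(X_val, y_val)             # validation samples are equally weighted

model = CatBoostClassifier(iterations=100, grow_policy="Lossguide", verbose=False)
model.fit(train_pool, eval_set=val_pool)
\end{verbatim}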
\emph{Border Count} -In regression trees of gradient-boosting, the split finding is typically approximated with quantization, whereby all numeric features are first discretised into a fixed number of buckets through histogram building, and splits are evaluated at the border of the buckets \autocite[][2]{keLightGBMHighlyEfficient2017}. To increase the number of split candidates, we raise the border count to \num{254}. Generally, this leads to increased accuracy at the cost of computational efficiency. Yet, in the experiment above, the improvements in validation loss and accuracy are minor compared to the previous modifications. +In regression trees of gradient-boosting, the split finding is typically approximated with quantization, whereby all numeric features are first discretized into a fixed number of buckets through histogram building, and splits are evaluated at the border of the buckets \autocite[\checkmark][3147]{keLightGBMHighlyEfficient2017}. To increase the number of split candidates, we raise the border count to \num{254}. Generally, this leads to increased accuracy at the cost of computational efficiency. Yet, in the experiment above, the improvements in validation loss and accuracy are minor compared to the previous modifications. \emph{Early Stopping and Checkpointing} @@ -58,9 +58,9 @@ \subsubsection{Training of Supervised \textbf{FT-Transformer} -We rely on the FT-Transformer of \textcite[][4--5]{gorishniyRevisitingDeepLearning2021} as our second model. The training of Transformers has been found non-trivial and requires a carefully designed training setup of model, optimizer, and learning rate schedule \autocite[][1]{liuUnderstandingDifficultyTraining2020}. We investigate minor modifications to the default FT-Transformer to stabilize training and improve overall performance. The default FT-Transformer is trained for 10 epochs on \gls{ISE} dataset with classical features and loss and accuracy are visualized in \cref{fig:fttransformer-optimizations-loss-acc}.\footnote{Default configuration documented in \textcite[][18]{gorishniyRevisitingDeepLearning2021}.} +We rely on the FT-Transformer of \textcite[][18935--18936]{gorishniyRevisitingDeepLearning2021} as our second model. The training of Transformers has been found non-trivial and requires a carefully designed training setup of model, optimizer, and learning rate schedule \autocite[\checkmark][5747]{liuUnderstandingDifficultyTraining2020}. We investigate minor modifications to the default FT-Transformer to stabilize training and improve overall performance. The default FT-Transformer is trained for \num{10} epochs on the \gls{ISE} dataset with classical features, and loss and accuracy are visualized in \cref{fig:fttransformer-optimizations-loss-acc}.\footnote{Default configuration documented in \textcite[\checkmark][18949]{gorishniyRevisitingDeepLearning2021}.} -The convergence behavior of our model is similar to that of gradient boosting. Equally, a significant generalization gap exists between the training and validation loss. Particularly concerning, the training loss decreases sharply, while the validation loss spuriously improves over its initial estimate. Despite this, validation accuracy improves throughout the entire training cycle. We reason that the network learns to correctly classify trades, indicated by the improved accuracy, but only attains low-confident correct predictions or confident but erroneous predictions which both contribute to a large validation loss.
+The convergence behavior of our model is similar to that of gradient boosting. Likewise, a significant generalization gap exists between the training and validation loss. The training loss decreases sharply, while the validation loss barely improves beyond its initial estimate. Despite this, validation accuracy improves throughout the entire training cycle. We reason that the network learns to correctly classify trades, indicated by the improved accuracy, but only attains low-confident correct predictions or confident but erroneous predictions, which both contribute to a large validation loss. \begin{figure}[!ht] \centering @@ -73,21 +73,21 @@ \subsubsection{Training of Supervised \emph{Activation Function} -Motivated by previous research, we experiment with replacing the $\operatorname{ReLU}$ activation with the $\operatorname{GELU}$ activation function \autocite[][2]{hendrycksGaussianErrorLinear2020} in the classification head and the gated variant $\operatorname{ReGLU}$ with the gated variant $\operatorname{GEGLU}$ \autocite[][2]{shazeerGLUVariantsImprove2020} in the \gls{FFN}. As visualized in \cref{fig:fttransformer-optimizations-loss-acc}, no advantage in terms of validation accuracy or loss is evident. +Motivated by previous research, we experiment with replacing the $\operatorname{ReLU}$ activation with the $\operatorname{GELU}$ activation function \autocite[\checkmark][2]{hendrycksGaussianErrorLinear2020} in the classification head and the gated variant $\operatorname{ReGLU}$ with $\operatorname{GEGLU}$ \autocite[\checkmark][2]{shazeerGLUVariantsImprove2020} in the \gls{FFN}. As visualized in \cref{fig:fttransformer-optimizations-loss-acc}, no advantage in terms of validation accuracy or loss is evident. \emph{Sample Weighting} -We apply the concept of sample weighting from \gls{GBRT} to Transformers. Specifically, we scale the contribution of individual training samples to the loss using a sample weight, which penalizes the model for misclassifying recent observations. This method is crucial for achieving low validation loss and high validation accuracies, as visible in \cref{fig:fttransformer-optimizations-loss-acc}. The significantly lower training accuracy implies, that patterns from latter observations do not universally transfer to previous observations. At this time, it remains unclear what is causing the data drift within the training set. +We apply the concept of sample weighting from \gls{GBRT} to Transformers. Specifically, we scale the contribution of individual training samples to the loss using a sample weight, which penalizes the model for misclassifying recent observations. This method is crucial for achieving low validation loss and high validation accuracies, as visible in \cref{fig:fttransformer-optimizations-loss-acc}. The significantly lower training accuracy implies that patterns from later observations do not universally transfer to earlier observations. It remains unclear what is causing the data drift within the training set. \clearpage \emph{Label Smoothing} -A major problem in classification with neural networks is, that the network becomes overconfident in predicting training samples but performs poorly on unseen data. In \cref{fig:fttransformer-optimizations-loss-acc} the effect is evident, as the increased confidence in the prediction on the training set does not transfer to the validation set.
To regularize the network, we experiment with label smoothing \autocite[][2823]{szegedyRethinkingInceptionArchitecture2016} by training on soft labels with an uncertainty constant of $\epsilon$. Instead of assigning hard class probabilities of 0 or 1, we assume that true labels in the training set are correct with $1-\epsilon$ probability and incorrect otherwise. For $\epsilon=\num{0.1}$, a trade with the true label $-1$ is assumed to be \SI{90}{\percent} seller-initiated and \SI{10}{\percent} buyer-initiated. While we observe that label smoothing improves the validation loss and reduces the generalization gap, we find that it has a negligible effect on validation accuracy and therefore abandon this approach. +A major problem in classification with neural networks is that the network becomes overconfident in predicting training samples but performs poorly on unseen data. In \cref{fig:fttransformer-optimizations-loss-acc} the effect is evident, as the increased confidence in the prediction on the training set does not transfer to the validation set. To regularize the network, we experiment with label smoothing \autocite[\checkmark][2823]{szegedyRethinkingInceptionArchitecture2016} by training on soft labels with an uncertainty constant of $\epsilon$. Instead of assigning hard class probabilities of 0 or 1, we assume that true labels in the training set are correct with $1-\epsilon$ probability and incorrect otherwise. For $\epsilon=\num{0.1}$, a trade with the true label $-1$ is assumed to be \SI{90}{\percent} seller-initiated and \SI{10}{\percent} buyer-initiated. While we observe that label smoothing improves the validation loss and reduces the generalization gap, we find that it has a negligible effect on validation accuracy and therefore abandon the approach. \emph{Learning Rate Schedule} -When training Transformers, the learning rate is often adjusted throughout the training process. \textcite[][7]{vaswaniAttentionAllYou2017} use a learning rate warm-up period, whereby the learning rate is linearly increased in the early stages of training, followed by decay using an inverse square root learning rate schedule. The warm-up phase is thought to stabilize gradients as weight updates are considerably smaller. According to \cref{sec:residual-connections-layer-norm}, learning rate warm-up is crucial for training post-norm Transformers, but optional for pre-norm Transformers like the FT-Transformer. Nevertheless, we experiment with the effect of learning rate warm-up in our setting and combine a linear warm-up for two epochs with subsequent cosine decay, as visualized in \cref{fig:lr-lin-warmup-cosine-decay}. +When training Transformers, the learning rate is often adjusted throughout the training process. \textcite[][6007]{vaswaniAttentionAllYou2017} use a learning rate warm-up period, whereby the learning rate is linearly increased in the early stages of training, followed by decay using an inverse square root learning rate schedule. The warm-up phase is thought to stabilize gradients as weight updates are considerably smaller. According to \cref{sec:residual-connections-layer-norm}, learning rate warm-up is crucial for training post-norm Transformers, but optional for pre-norm Transformers like the FT-Transformer. Nevertheless, we experiment with the effect of learning rate warm-up in our setting and combine a linear warm-up for two epochs with subsequent cosine decay, as visualized in \cref{fig:lr-lin-warmup-cosine-decay}.
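Such a schedule can be expressed with PyTorch's \texttt{LambdaLR}; the sketch below uses placeholder values for the number of steps per epoch and a stand-in module for the FT-Transformer.

\begin{verbatim}
import math

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

steps_per_epoch, max_epochs, warmup_epochs = 100, 20, 2      # placeholder values
warmup_steps = warmup_epochs * steps_per_epoch
total_steps = max_epochs * steps_per_epoch


def lr_lambda(step: int) -> float:
    if step < warmup_steps:                                  # linear warm-up
        return step / max(1, warmup_steps)
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return 0.5 * (1.0 + math.cos(math.pi * progress))        # cosine decay to zero


model = torch.nn.Linear(10, 2)                               # stand-in for the FT-Transformer
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)
scheduler = LambdaLR(optimizer, lr_lambda)

for step in range(total_steps):
    # forward pass and loss.backward() omitted in this sketch
    optimizer.step()
    scheduler.step()
\end{verbatim}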
\begin{figure}[!ht] \centering @@ -96,13 +96,13 @@ \subsubsection{Training of Supervised \label{fig:lr-lin-warmup-cosine-decay} \end{figure} -The scheduled learning rate has soothing effects on the training loss and accuracy estimates, as evident in \cref{fig:fttransformer-optimizations-loss-acc}. Therefore, we adopt a training setup with a learning rate schedule, despite the negative effects on training time. The learning rate itself is tuned as part of \cref{sec:hyperparameter-tuning}. +The scheduled learning rate smooths the training loss and accuracy estimates, as evident in \cref{fig:fttransformer-optimizations-loss-acc}. Therefore, we adopt a training setup with a learning rate schedule, despite the negative effects on training time. The learning rate itself is tuned as part of \cref{sec:hyperparameter-tuning}. \emph{Batch Size} % 20 epochs (\num{36460} / \num{145840} iterations) -We use a fixed batch size of \num{8192} samples for the feature set classic/size and \num{2048} for the feature set option, which is the largest possible size on our \gls{GPU}. Training is performed for \num{20} epochs at maximum. All samples within the training and validation set are shuffled randomly to promote convergence. Although a smaller batch size could enhance the generalization capabilities of the model, as found in \textcite[][3]{keskarLargeBatchTrainingDeep2017}, we train on the largest number of trades per iteration, to optimize throughput. Additional regularization is added to the model, but treated as a tunable hyperparameter. +We use a fixed batch size of \num{8192} samples for the feature set classic/size and \num{2048} for the feature set option, which is the largest possible size on our \gls{GPU}. Training is performed for \num{20} epochs at maximum. All samples within the training and validation set are shuffled randomly to promote convergence. Although a smaller batch size could enhance the generalization capabilities of the model, as found in \textcite[\checkmark][3]{keskarLargeBatchTrainingDeep2017}, we train on the largest number of trades per iteration, to optimize throughput. Additional regularization is added to the model, but treated as a tunable hyperparameter. \emph{Early Stopping and Checkpointing} @@ -110,15 +110,17 @@ \subsubsection{Training of Supervised \emph{Optimizer} -In line with \textcite[][6]{gorishniyRevisitingDeepLearning2021}, we train the models using the AdamW optimizer \autocite[][2--3]{loshchilovDecoupledWeightDecay2019} with the standard hyperparameters.\footnote{Parameters $\beta_{1}=0.9, \beta_{2}=0.999$, and $\epsilon = \num{1e-8}$.} The weight decay coefficient in AdamW determining the degree of regularization is tuned in \cref{sec:hyperparameter-tuning}. Weight decay is selectively applied and excludes embeddings, LayerNorm, and biases. +In line with \textcite[\checkmark][18937]{gorishniyRevisitingDeepLearning2021}, we train the models using the AdamW optimizer \autocite[\checkmark][2--3]{loshchilovDecoupledWeightDecay2019} with the standard hyperparameters.\footnote{Parameters $\beta_{1}=0.9, \beta_{2}=0.999$, and $\epsilon = \num{1e-8}$.} The weight decay coefficient in AdamW determining the degree of regularization is tuned in \cref{sec:hyperparameter-tuning}. Weight decay is selectively applied and excludes embeddings, LayerNorm, and biases. -In summary, we extend the training setup of \textcite[][6]{gorishniyRevisitingDeepLearning2021} with a sample weighting scheme and learning rate schedule aimed at boosting performance and training stability. 
+In summary, we extend the training setup of \textcite[\checkmark][18937]{gorishniyRevisitingDeepLearning2021} with a sample weighting scheme and learning rate schedule aimed at boosting performance and training stability. + +\vskip 1.3in \textbf{Solutions For Classical Rules} Classical trade classification rules serve as a benchmark in our work. We implement them as a classifier that combines arbitrary trade classification rules through stacking, as covered in \cref{sec:semi-supervised-approaches}. -In cases where classification is not feasible due to missing data or the rules' definition itself, we resort to a random classification, which achieves an average accuracy of \SI{50}{\percent}. The approach is adopted from \textcite[][887]{savickasInferringDirectionOption2003}. +In cases where classification is not feasible due to missing data or the rules' definition itself, we resort to a random classification, which achieves an average accuracy of \SI{50}{\percent}. The approach is adopted from \textcite[\checkmark][887]{savickasInferringDirectionOption2003}. \subsubsection{Training of Semi-supervised Models}\label{sec:training-of-semi-supervised-models} @@ -127,32 +129,32 @@ \subsubsection{Training of Semi-supervised \textbf{Gradient Boosting With Self-Training} -To incorporate unlabeled trades into the training procedure, we combine gradient boosting with a self-training classifier, as derived in \cref{sec:extensions-to-gradient-boosted-trees}. We repeat self-training for 2 iterations and require the predicted class probability to exceed $\tau=0.9$. As the entire ensemble is rebuilt three times, the relatively low number of iterations and high confidence threshold, strike a balance between computational requirements and the need for high-quality predictions. The base classifier is otherwise identical to supervised gradient boosting from \cref{sec:training-of-supervised-models}. +To incorporate unlabeled trades into the training procedure, we combine gradient boosting with a self-training classifier, as derived in \cref{sec:extensions-to-gradient-boosted-trees}. We repeat self-training for \num{2} iterations and require the predicted class probability to exceed $\tau=0.9$. As the entire ensemble is rebuilt three times, the low number of iterations and the high confidence threshold strike a balance between computational requirements and the need for high-quality predictions. The base classifier is otherwise identical to supervised gradient boosting from \cref{sec:training-of-supervised-models}. \textbf{FT-Transformer with Pre-Training} The FT-Transformer is trained in two stages. First, we train for \num{20} epochs on unlabeled \gls{ISE} trades using the \gls{RTD} head, followed by \num{20} epochs of fine-tuning on labeled \gls{ISE} training data with the binary classification head. -During pre-training and fine-tuning, early stopping is applied based on the value of the objective on the validation set, using patience of \num{10}. This particular setup is adopted from \textcite[][15]{rubachevRevisitingPretrainingObjectives2022} for being compute-efficient and offering competitive performance. The hidden dimension of the classification head is set to \num{512}. Based on \textcite[][3]{clarkElectraPretrainingText2020} \SI{15}{\percent} of all tokens are replaced. +During pre-training and fine-tuning, early stopping is applied based on the value of the objective on the validation set, using a patience of \num{10}.
This particular setup is adopted from \textcite[\checkmark][15]{rubachevRevisitingPretrainingObjectives2022} for being compute-efficient and offering competitive performance on tabular datasets. The hidden dimension of the classification head is set to \num{512}. Based on \textcite[][3]{clarkElectraPretrainingText2020}, \SI{15}{\percent} of all tokens are replaced. -Since the unlabeled sample includes various types of trades that may not be comparable to the labeled sample, we update all layers during fine-tuning. Empirically, fine-tuning the entire model is among the most successful methods for large-scale Transformers, as results from \textcite[][104--105]{raeScalingLanguageModels2022} suggest. +Since the unlabeled sample includes various types of trades that may not be comparable to the labeled sample, we update all layers during fine-tuning. Empirically, fine-tuning the entire model is among the most successful methods, as results from \textcite[\checkmark][104--105]{raeScalingLanguageModels2022} for Transformers in general and from \textcite[][7]{merchantWhatHappensBERT2020} specifically for \gls{BERT}-like architectures document. Yet, updating all layers is also the most resource-intensive option. -Following \textcite[][4]{rubachevRevisitingPretrainingObjectives2022}, the learning rate and weight decay are shared between the pre-training and fine-tuning stages. Given the nature of pre-training, all other hyperparameters related to the model are identical. +Following \textcite[\checkmark][4]{rubachevRevisitingPretrainingObjectives2022}, the learning rate and weight decay are shared between the pre-training and fine-tuning stages. All other hyperparameters related to the model are identical. \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} -All of our machine-learning models feature a set of tunable hyperparameters. The results of previous studies, exemplary the one of \textcite[][5]{grinsztajnWhyTreebasedModels2022}, emphasize the need for tuning routines, as the test performance of the FT-Transformer and \glspl{GBRT} largely fluctuates with the hyperparameter configuration. Classical rules have no hyperparameters per se, but the best hybrid rules can be attained through hyperparameter search. +All of our machine-learning models feature a set of tunable hyperparameters. The results of previous studies, such as the one of \textcite[\checkmark][5]{grinsztajnWhyTreebasedModels2022}, emphasize the need for tuning routines, as the test performance of the FT-Transformer and \glspl{GBRT} largely fluctuates with the hyperparameter configuration. Classical rules have no hyperparameters per se, but the best hybrid rules can be attained through hyperparameter search. For a fair comparison, we employ an exhaustive hyperparameter search to find a suitable hyperparameter configuration for each of our models. \textbf{Bayesian Search} -We perform a novel Bayesian search to suggest and tune the hyperparameters automatically. In Bayesian search, a prior belief for all possible objective functions is formulated from the parameter intervals, which is then gradually refined by updating the Bayesian posterior with data from previous trials thereby approximating the likely objective function \autocite[][2]{shahriariTakingHumanOut2016}. Compared to brute-force approaches, such as grid search, unpromising search regions are omitted, resulting in more promising trials. +We run a novel Bayesian search to suggest and tune the hyperparameters automatically.
In Bayesian search, a prior belief for all possible objective functions is formulated from the parameter intervals, which is then gradually refined by updating the Bayesian posterior with data from previous trials thereby approximating the likely objective function \autocite[\checkmark][149]{shahriariTakingHumanOut2016}. Compared to brute-force approaches, such as grid search, unpromising search regions are omitted, resulting in more promising trials. -While different algorithmic implementations exist for Bayesian optimization, we choose the \emph{Optuna} library \autocite[][1--10]{akibaOptunaNextgenerationHyperparameter2019}, which implements the tree parzen estimator algorithm and is capable of handling both continuous and categorical hyperparameters.\footnote{Implementation of the tree-parzen estimator searches the first 10 trials randomly before the completed trials affect the sampling.} We maximize the accuracy of the validation set, which is also our decisive metric for evaluation (cp. \cref{sec:evaluation-metric}), and run $\num{50}$ trials per feature set for the \gls{GBRT} and $\num{10}$ trials for the FT-Transformer. The best combination of each is tested out-of-sample in \cref{sec:results}. +While different algorithmic implementations exist for Bayesian optimization, we choose the \emph{Optuna} library \autocite[\checkmark][2623--2631]{akibaOptunaNextgenerationHyperparameter2019}, which implements the tree parzen estimator algorithm and is capable of handling both continuous and categorical hyperparameters.\footnote{Implementation of the tree-parzen estimator searches the first 10 trials randomly before the completed trials affect the sampling.} We maximize the accuracy of the validation set, which is also our decisive metric for evaluation (cp. \cref{sec:evaluation-metric}), and run $\num{50}$ trials per feature set for the \gls{GBRT} and $\num{10}$ trials for the FT-Transformer. The best combination of each is tested out-of-sample in \cref{sec:results}. \textbf{Gradient Boosting} -Our search space is reported in \cref{tab:hyperparameter-space-gbm}, which is aligned with the recommendations in \textcites[][20]{prokhorenkovaCatBoostUnbiasedBoosting2018}[][18]{gorishniyRevisitingDeepLearning2021}[][4]{rubachevRevisitingPretrainingObjectives2022} with minor deviations. +Our search space is reported in \cref{tab:hyperparameter-space-gbm}, which is aligned with the recommendations in \textcite[][6659]{prokhorenkovaCatBoostUnbiasedBoosting2018} with minor deviations. \begin{table}[!h] \centering @@ -164,18 +166,18 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} Hyperparameter & Distribution \\ \midrule Depth & $\operatorname{UniformInt}[1,12]$ \\ Learning rate $\eta$ & $\operatorname{LogUniform}[0.001, 0.125]$ \\ - $\ell_2$ Leaf Regularization & $\operatorname{UniformInt}[2, 30]$ \\ - Random Strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ \\ - Bagging Temperature & $\operatorname{Uniform}[0.0, 1.0]$ \\ \bottomrule + $\ell_2$ Leaf regularization & $\operatorname{UniformInt}[2, 30]$ \\ + Random strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ \\ + Bagging temperature & $\operatorname{Uniform}[0.0, 1.0]$ \\ \bottomrule \end{tabular} \end{table} -As documented in \cref{tab:hyperparameter-space-gbm}, we tune five hyperparameters for gradient boosting. The first is the depth, which determines the number of levels in each tree. 
Other than \textcite[][]{gorishniyRevisitingDeepLearning2021}, we increase the upper bound to twelve to allow for more complex ensemble members. Acknowledging the research of \textcite[][14]{friedmanGreedyFunctionApproximation2001} that the learning rate \eta~and the size of the ensemble have a strong interdependence, we only tune the learning rate and stop extending the ensemble based on the early stopping criterion. Random strength, bagging temperature, and $\ell_2$ leaf regularization are measures to counter overfitting. Specifically, random strength controls the degree of Gaussian noise added to the scores of split candidates to introduce randomness in the selected splits. In a similar vein, the algorithm introduces randomness on the sample level through Bayesian bootstrap \autocite[][130--131]{rubinBayesianBootstrap1981}. The hyperparameter controls the distribution used for sampling, and implicitly the aggressiveness of bagging. Finally, $\ell_2$ leaf regularization adds a penalty term to the terminal leaf's estimates. The hyperparameter controls the degree of regularization. +As documented, we tune five hyperparameters for gradient boosting. The first is the depth, which determines the number of levels in each tree. Other than \textcite[\checkmark][18952]{gorishniyRevisitingDeepLearning2021}, we increase the upper bound to twelve to allow for more complex ensemble members. Acknowledging the research of \textcite[\checkmark][1203]{friedmanGreedyFunctionApproximation2001} that the learning rate \eta~and the size of the ensemble have a strong interdependence, we only tune the learning rate and stop extending the ensemble based on the early stopping criterion. Random strength, bagging temperature, and $\ell_2$ leaf regularization are measures to counter overfitting. Specifically, random strength controls the degree of Gaussian noise added to the scores of split candidates to introduce randomness in the selected splits. In a similar vein, the algorithm introduces randomness on the sample level through Bayesian bootstrap \autocite[\checkmark][130--131]{rubinBayesianBootstrap1981}. The hyperparameter controls the distribution used for sampling, and implicitly the aggressiveness of bagging. Finally, $\ell_2$ leaf regularization adds a penalty term to the terminal leaf's estimates. The hyperparameter controls the degree of regularization. \cref{fig:ise-gbm-hyperparam-classical} visualizes the hyperparameter search space of the \gls{GBRT} on the \gls{ISE} dataset with classical features, from which we can derive several observations. First, hyperparameter tuning has a significant impact on the prediction, as the validation accuracy varies between \SI{58.429}{\percent} and \SI{64.378}{\percent} for different trials. Second, the best hyperparameter combination, marked with \bestcircle, lies off-the-borders surrounded by other promising trials, indicated by the contours, from which we can conclude, that the found solution is a stable and reasonable choice for further analysis. 
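To make the tuning loop concrete, the sketch below shows how the five-dimensional search space above could be handed to the tree-parzen estimator in Optuna. It is an illustration under stated assumptions, not the project's tuning script: the synthetic arrays stand in for the ISE train/validation split, and the ensemble cap of 2,000 trees with a patience of 100 rounds for early stopping are assumed budgets.

# Minimal sketch of the GBRT Bayesian search (assumed data and budgets).
import numpy as np
import optuna
from catboost import CatBoostClassifier

# Stand-in data so the sketch runs; in the project these would be the ISE
# feature matrices and buy/sell labels of the train and validation split.
rng = np.random.default_rng(42)
X_train, y_train = rng.normal(size=(1_000, 5)), rng.integers(0, 2, 1_000)
X_val, y_val = rng.normal(size=(200, 5)), rng.integers(0, 2, 200)


def objective(trial: optuna.Trial) -> float:
    """One trial: sample from the search space, fit, return validation accuracy."""
    params = {
        "depth": trial.suggest_int("depth", 1, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.125, log=True),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 30),
        "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
    }
    model = CatBoostClassifier(
        **params,
        iterations=2_000,  # assumed upper bound; early stopping trims the ensemble
        eval_metric="Accuracy",
        verbose=False,
    )
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100)
    return model.score(X_val, y_val)  # validation accuracy, the tuning objective


# TPE samples the first 10 trials at random before the posterior guides sampling.
study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.TPESampler(n_startup_trials=10)
)
study.optimize(objective, n_trials=50)  # 50 trials per feature set for the GBRT

The same driver can be reused unchanged for the semi-supervised variant, whose search space is identical to the supervised one.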
\begin{figure}[!h] - \subfloat[Hyperparameter Search Space of \gls{GBRT} With Feature Set Classical\label{fig:ise-gbm-hyperparam-classical}]{\includegraphics[width=0.6\textwidth]{1gzk7msy-hyperparam-search-space.pdf}} + \subfloat[Hyperparameter Search Space of \gls{GBRT} With Feature Set Classic\label{fig:ise-gbm-hyperparam-classical}]{\includegraphics[width=0.6\textwidth]{1gzk7msy-hyperparam-search-space.pdf}} \vfill \subfloat[Hyperparameter Search Space of \gls{GBRT} With Feature Set Size\label{fig:ise-gbm-hyperparam-classical-size}]{\includegraphics[width=0.6\textwidth]{3vntumoi-hyperparam-search-space.pdf}} \caption[Hyperparameter Search Space of Gradient-Boosting]{Hyperparameter Search Space of \gls{GBRT} on \gls{ISE} Validation Set} @@ -189,57 +191,59 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} In \cref{fig:ise-gbm-hyperparam-classical-size} we repeat the analysis for \gls{GBRT} trained on size features. The loss surface is smooth with large connected regions. As the best solution achieving \SI{75.03504680858162}{\percent} accuracy lies within a splayed region of dense sampling, it is a good choice for further analysis. Consistent with the loss surface of \cref{fig:ise-gbm-hyperparam-classical}, the trees are grown to the maximum depth and a high learning rate, indicating the need for complex ensemble members highly corrective to previous predictions. Part of this could be due to the low signal-to-noise ratio in financial data. -The loss surface of the \gls{GBRT} trained on the feature set including option features is the least fragmented. While the validation accuracy of the best combinations improves significantly to \SI{76.99459643967347}{\percent}, worst trials even underperform these of smaller feature sets. Based on this observation we conjecture, that more data does not per se improve the model and that models require a thoughtful tuning procedure. By this means, our conclusion contradicts the one of \textcite[][14]{ronenMachineLearningTrade2022}, who find no advantage in tuning tree-based ensembles for trade classification. Results are tabulated in \cref{tab:solutions-gbm}. +The loss surface of the \gls{GBRT} trained on the feature set including option features is the least fragmented. While the validation accuracy of the best combinations improves significantly to \SI{76.99459643967347}{\percent}, worst trials even underperform these of smaller feature sets. We note, more data does not per se improve the model and that models require a thoughtful tuning procedure. Our conclusion contradicts the one of \textcite[\checkmark][14]{ronenMachineLearningTrade2022}, who find no advantage in tuning tree-based ensembles for trade classification. Results are tabulated in \cref{tab:solutions-gbm}. \begin{table}[!h] \centering - \sisetup{table-format=3.2, table-number-alignment=right} - \caption[Search Solutions of Gradient Boosting]{Search solutions of gradient boosting. The three right columns document the best combination in terms of validation accuracy per feature set. We perform \num{50} trials each.} + \sisetup{table-format=3.2,table-alignment-mode = none, table-number-alignment=left, table-text-alignment = left} + \caption[Search Solutions of Gradient Boosting]{Search solutions of gradient boosting. The three right columns document the best combination in terms of validation accuracy per feature set. 
We perform \num{50} trials.} \label{tab:solutions-gbm} \begin{tabular}{@{}llSSS@{}} \toprule - Hyperparameter & Distribution & {\glsentryshort{FS} Classical} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule - Depth & $\operatorname{UniformInt}[1,12]$ & 8 & 9 & 12 \\ - Learning rate $\eta$ & $\operatorname{LogUniform}[0.001, 0.125]$ & 0.12484221864046671 & 0.12347889459796775 & 0.12471458170177774 \\ - $\ell_2$ Leaf Regularization & $\operatorname{UniformInt}[2, 30]$ & 15 & 5 & 16 \\ - Random Strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ & \num{4e-9} & \num{4e-7} & \num{8e-6} \\ - Bagging Temperature & $\operatorname{Uniform}[0.0, 1.0]$ & 0.6419530220498153 & 0.5574912093427532 & 0.45578836944233 \\ \midrule - Validation Accuracy in \% & & 64.37816236230594 & 75.03504680858162 & 76.99459643967347 \\ \bottomrule + Hyperparameter & Distribution & {\glsentryshort{FS} Classic} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule + Depth & $\operatorname{UniformInt}[1,12]$ & 8 & 9 & 12 \\ + Learning rate $\eta$ & $\operatorname{LogUniform}[0.001, 0.125]$ & 0.12484221864046671 & 0.12347889459796775 & 0.12471458170177774 \\ + $\ell_2$ Leaf regularization & $\operatorname{UniformInt}[2, 30]$ & 15 & 5 & 16 \\ + Random strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ & \num{4e-9} & \num{4e-7} & \num{8e-6} \\ + Bagging temperature & $\operatorname{Uniform}[0.0, 1.0]$ & 0.6419530220498153 & 0.5574912093427532 & 0.45578836944233 \\ \midrule + Validation Accuracy in \% & & 64.37816236230594 & 75.03504680858162 & 76.99459643967347 \\ \bottomrule \end{tabular} \end{table} +\vskip 3in + \textbf{Gradient Boosting With Self-Training} -The search space for the semi-supervised variant is identical to the supervised gradient boosting. To conserve space, we only report the tabulated results in \cref{tab:solutions-GBRT-self-training}. Visualizations of the hyperparameter search space are available online.\footnote{See \url{https://wandb.ai/fbv/thesis/runs/37lymmzc} for \gls{FS} classical, \url{https://wandb.ai/fbv/thesis/runs/324v3uv5} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/t55nd8r0} for \gls{FS} option.} +The search space for the semi-supervised variant is identical to the supervised gradient boosting. To conserve space, we only report the tabulated results in \cref{tab:solutions-GBRT-self-training}.\footnote{Visualizations of the hyperparameter search space are available online. See \url{https://wandb.ai/fbv/thesis/runs/37lymmzc} for \gls{FS} classic, \url{https://wandb.ai/fbv/thesis/runs/324v3uv5} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/t55nd8r0} for \gls{FS} option.} \begin{table}[!h] \centering - \sisetup{table-format=3.2, table-number-alignment=right} - \caption[Search Solutions of Gradient Boosting With Self-Training]{Search solutions of gradient boosting with self-training. The three right columns document the best combination in terms of validation accuracy per feature set. We perform \num{50} trials each. Arrows indicate the change compared to the supervised variant.} + \sisetup{table-format=3.2,table-alignment-mode = none, table-number-alignment=left, table-text-alignment = left} + \caption[Search Solutions of Gradient Boosting With Self-Training]{Search solutions of gradient boosting with self-training. The three right columns document the best combination in terms of validation accuracy per feature set. We perform \num{50} trials. 
Arrows indicate the change compared to the supervised variant.} \label{tab:solutions-GBRT-self-training} \begin{tabular}{@{}llSSS@{}} \toprule - Hyperparameter & Distribution & {\glsentryshort{FS} Classical} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule - Depth & $\operatorname{UniformInt}[1,12]$ & 9 & 10 & 9 \\ - Learning rate $\eta$ & $\operatorname{LogUniform}[0.001, 0.125]$ & 0.12337960608926582 & 0.1248422186404667 & 0.12347504812996231 \\ - $\ell_2$ Leaf Regularization & $\operatorname{UniformInt}[2, 30]$ & 12 & 9 & 13 \\ - Random Strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ & \num{2e-8} & \num{5e-8} & \num{5e-8} \\ - Bagging Temperature & $\operatorname{Uniform}[0.0, 1.0]$ & 0.34010535578784745 & 0.5214954412829511 & 0.4666577105566224 \\ \midrule - \multicolumn{2}{l}{Validation Accuracy in \%} & {$\textcolor{viz-red}{\downarrow} \num{64.29671279599335}$} & {$\textcolor{viz-red}{\downarrow} \num{74.83010065958079}$} & {$\textcolor{viz-red}{\downarrow} \num{76.41433947686962}$} \\ \bottomrule + Hyperparameter & Distribution & {\glsentryshort{FS} Classic} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule + Depth & $\operatorname{UniformInt}[1,12]$ & 9 & 10 & 9 \\ + Learning rate $\eta$ & $\operatorname{LogUniform}[0.001, 0.125]$ & 0.12337960608926582 & 0.1248422186404667 & 0.12347504812996231 \\ + $\ell_2$ Leaf regularization & $\operatorname{UniformInt}[2, 30]$ & 12 & 9 & 13 \\ + Random strength & $\operatorname{LogUniform}[\num{1e-9}, 10.0]$ & \num{2e-8} & \num{5e-8} & \num{5e-8} \\ + Bagging temperature & $\operatorname{Uniform}[0.0, 1.0]$ & 0.34010535578784745 & 0.5214954412829511 & 0.4666577105566224 \\ \midrule + Validation Accuracy in \% & & {$\textcolor{viz-red}{\downarrow} \num{64.29671279599335}$} & {$\textcolor{viz-red}{\downarrow} \num{74.83010065958079}$} & {$\textcolor{viz-red}{\downarrow} \num{76.41433947686962}$} \\ \bottomrule \end{tabular} \end{table} -Matching the supervised results, semi-supervised ensembles exhaust the maximum tree depth and combine trees with a coarse learning rate. By parameter importance, both are most influential on the final result. Again, this is an indication that the trade data is not easily separable, requiring multiple features and splits. The found hyperparameters for $\ell_2$ leaf regularization, random strength and bagging are balanced. Overall, the best validation accuracies are slightly inferior to the supervised variant. +Matching the supervised results, semi-supervised ensembles exhaust the maximum tree depth and combine trees with a coarse learning rate. By parameter importance, both are most influential on the final result. Again, this is an indication that the trade data is not easily separable, requiring multiple features and splits. The found hyperparameters for $\ell_2$ leaf regularization, random strength, and bagging are balanced. Overall, the best validation accuracies are slightly inferior to the supervised variant. \clearpage \textbf{FT-Transformer} -The search space for the FT-Transformer is identical to \textcite[][18]{gorishniyRevisitingDeepLearning2021} (variant (b)) with minor deviations and reported in \cref{tab:hyperparameter-space-2}. +The search space for the FT-Transformer is identical to \textcite[\checkmark][18950]{gorishniyRevisitingDeepLearning2021} (variant (b)) with minor deviations and reported in \cref{tab:hyperparameter-space-2}. 
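The distributions of the following table can be encoded analogously to the GBRT search. The sketch below only illustrates the sampling; train_and_validate is a hypothetical placeholder for the actual pre-training and fine-tuning routine, and the parameter names are assumptions rather than the project's API.

# Sketch of the FT-Transformer search space; train_and_validate is a placeholder.
import optuna


def train_and_validate(**params):
    """Hypothetical stand-in: build the FT-Transformer with the sampled
    configuration, pre-train/fine-tune it, and return validation accuracy."""
    return 0.0  # placeholder value


def objective(trial: optuna.Trial) -> float:
    params = {
        "n_layers": trial.suggest_int("n_layers", 1, 6),
        "d_embedding": trial.suggest_int("d_embedding", 64, 256),
        "attention_dropout": trial.suggest_float("attention_dropout", 0.0, 0.5),
        "ffn_dropout": trial.suggest_float("ffn_dropout", 0.0, 0.5),
        "learning_rate": trial.suggest_float("learning_rate", 3e-5, 3e-4, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True),
    }
    return train_and_validate(**params)


study = optuna.create_study(
    direction="maximize", sampler=optuna.samplers.TPESampler(n_startup_trials=10)
)
study.optimize(objective, n_trials=10)  # 10 trials per feature set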
\begin{table}[!h] \centering - \sisetup{table-text-alignment=right} + \sisetup{table-format=3.2,table-alignment-mode = none, table-number-alignment=left, table-text-alignment = left} \caption[Hyperparameter Search Space of FT-Transformer]{Hyperparameter search space of FT-Transformer.} \label{tab:hyperparameter-space-2} \begin{tabular}{@{}ll@{}} @@ -249,7 +253,7 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} Attention dropout & $\operatorname{Uniform}[0, 0.5]$ \\ \gls{FFN} dropout & $\operatorname{Uniform}[0, 0.5]$ \\ Learning rate $\eta$ & $\operatorname{LogUniform}[\num{3e-5}, \num{3e-4}]$ \\ - weight decay $\lambda$ & $\operatorname{LogUniform}[\num{1e-6}, \num{1e-3}]$ \\ \bottomrule + Weight decay $\lambda$ & $\operatorname{LogUniform}[\num{1e-6}, \num{1e-3}]$ \\ \bottomrule \end{tabular} \end{table} @@ -258,7 +262,7 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} Due to the pseudo-random sampling during the first trials of Bayesian search combined with the reduced number of trials for the Transformer studies, the tested hyperparameter combinations are identical for all feature sets, as visible in \cref{fig:ise-transformer-hyperparam}. \begin{figure}[!h] - \subfloat[Hyperparameter Search Space of FT-Transformer With Feature Set Classical\label{fig:ise-transformer-hyperparam-classical}]{\includegraphics[width=0.6\linewidth]{3jpe46s1-hyperparam-search-space.pdf}} + \subfloat[Hyperparameter Search Space of FT-Transformer With Feature Set Classic\label{fig:ise-transformer-hyperparam-classical}]{\includegraphics[width=0.6\linewidth]{3jpe46s1-hyperparam-search-space.pdf}} \vfill \subfloat[Hyperparameter Search Space of FT-Transformer With Feature Set Size\label{fig:ise-transformer-hyperparam-classical-size}]{\includegraphics[width=0.6\linewidth]{1qx3ul4j-hyperparam-search-space.pdf}} \caption[Hyperparameter Search Space of FT-Transformer]{Hyperparameter Search Space of FT-Transformer on \gls{ISE} Validation Set} @@ -276,19 +280,19 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} \begin{table}[!h] \centering - \sisetup{table-format=3.2, table-number-alignment=right} - \caption[Search Solutions of FT-Transformer]{Search solutions of FT-Transformer. The three right columns document the best combination in terms of validation accuracy per feature set. We perform \num{10} trials each. A discussion of these results is provided below.} + \sisetup{table-format=3.2,table-alignment-mode = none, table-number-alignment=left, table-text-alignment = left} + \caption[Search Solutions of FT-Transformer]{Search solutions of FT-Transformer. The three right columns document the best combination in terms of validation accuracy per feature set. We perform \num{10} trials. 
A discussion of these results is provided below.} \label{tab:solutions-transformer} \begin{tabular}{@{}llSSS@{}} \toprule - Hyperparameter & Distribution & {\glsentryshort{FS} Classical} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule - Layers $L$ & $\operatorname{UniformInt}[1,6]$ & 4 & 4 & 4 \\ - Embedding dimension $d_{\mathrm{e}}$ & $\operatorname{UniformInt}[64, 256]$ & 248 & 248 & 248 \\ - Attention dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.04424625102595975 & 0.04424625102595975 & 0.04424625102595975 \\ - \gls{FFN} dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.0979914312095726 & 0.0979914312095726 & 0.0979914312095726 \\ - Learning rate $\eta$ & $\operatorname{LogUniform}[\num{3e-5}, \num{3e-4}]$ & \num{1e-6} & \num{1e-6} & \num{1e-6} \\ - Weight decay $\lambda$ & $\operatorname{LogUniform}[\num{1e-6}, \num{1e-3}]$ & \num{6e-5} & \num{6e-5} & \num{6e-5} \\ \midrule - \multicolumn{2}{l}{Validation Accuracy in \%} & 64.69 & 75.42 & 77.17 \\ \bottomrule + Hyperparameter & Distribution & {\glsentryshort{FS} Classic} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule + Layers $L$ & $\operatorname{UniformInt}[1,6]$ & 4 & 4 & 4 \\ + Embedding dimension $d_{\mathrm{e}}$ & $\operatorname{UniformInt}[64, 256]$ & 248 & 248 & 248 \\ + Attention dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.04424625102595975 & 0.04424625102595975 & 0.04424625102595975 \\ + \gls{FFN} dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.0979914312095726 & 0.0979914312095726 & 0.0979914312095726 \\ + Learning rate $\eta$ & $\operatorname{LogUniform}[\num{3e-5}, \num{3e-4}]$ & \num{1e-6} & \num{1e-6} & \num{1e-6} \\ + Weight decay $\lambda$ & $\operatorname{LogUniform}[\num{1e-6}, \num{1e-3}]$ & \num{6e-5} & \num{6e-5} & \num{6e-5} \\ \midrule + Validation Accuracy in \% & & 64.69 & 75.42 & 77.17 \\ \bottomrule \end{tabular} \end{table} @@ -296,36 +300,36 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} \textbf{FT-Transformer With Pre-Training} -The hyperparameter search space for Transformers with a pre-training objective is identical to that shown in \cref{tab:hyperparameter-space-2}. As evident from \cref{tab:solutions-transformer-pretraining}, the found solutions are identical to these of the FT-Transformer without pre-training and identical for all three feature sets. During pre-training, we can detect if a token is replaced with \SI{94.06319856643677}{\percent} to \SI{95.89540958404541}{\percent} accuracy.\footnote{Na\"ive prediction yields \SI{85}{\percent} accuracy given the chosen replacement rate.} +The hyperparameter search space for Transformers with a pre-training objective is identical to that shown in \cref{tab:hyperparameter-space-2}. As evident from \cref{tab:solutions-transformer-pretraining}, the found solutions are identical to these of the FT-Transformer without pre-training and identical for all three feature sets. During pre-training, we can detect if a token is replaced with \SI{94.06319856643677}{\percent} to \SI{95.89540958404541}{\percent} accuracy.\footnote{Na\"ive prediction yields \SI{85}{\percent} accuracy given the chosen replacement rate.} \begin{figure}[!h] \centering \includegraphics{transformer_ise_pretrain_classical.pdf} - \caption[Pre-Training Loss of FT-Transformer]{Pre-training loss on \gls{ISE} sample with \gls{FS} classical. Training is performed for 20 epochs. 
Loss is the mean over all batches per epoch.} + \caption[Pre-Training Loss of FT-Transformer]{Pre-training loss on \gls{ISE} sample with \gls{FS} classic. Training is performed for 20 epochs. Loss is the mean over all batches per epoch.} \label{fig:fttransformer-pretrain-loss} \end{figure} Pre-training performance is however bound by the available computing budget. As evident from \cref{fig:fttransformer-pretrain-loss}, the models have not fully converged until the end of pre-training, as the loss on the train- and validation set steadily improves. -Validation accuracy after fine-tuning improves for all models over Transformers without pretraining. As the search space is identically sampled for both variants we can directly attribute the improvements of \SI{0.28}{\percent} to \SI{0.72}{\percent} in validation accuracy to pre-training on unlabeled trades. Visualizations of the hyperparameter search spaces are available online.\footnote{See \url{https://wandb.ai/fbv/thesis/runs/12isqh2m} for \gls{FS} classical, for \url{https://wandb.ai/fbv/thesis/runs/2hv1nayy} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/3jbqpp4r} for \gls{FS} option for details.} +Validation accuracy after fine-tuning improves for all models over Transformers without pretraining. As the search space is identically sampled for both variants we can directly attribute the improvements of \SI{0.28}{\percent} to \SI{0.72}{\percent} in validation accuracy to pre-training on unlabeled trades. \footnote{Visualizations of the hyperparameter search spaces are available online. See \url{https://wandb.ai/fbv/thesis/runs/12isqh2m} for \gls{FS} classic, for \url{https://wandb.ai/fbv/thesis/runs/2hv1nayy} for \gls{FS} size, and \url{https://wandb.ai/fbv/thesis/runs/3jbqpp4r} for \gls{FS} option for details.} \begin{table}[!h] \centering - \sisetup{table-format=3.2, table-number-alignment=right} - \caption[Search Solutions of FT-Transformer With Pre-training]{Search solutions of FT-Transformer with pretraining. The three right columns document the best combination in terms of validation accuracy per feature set. We perform \num{10} trials each. Arrows indicate the change compared to the supervised variant.} + \sisetup{table-format=3.2,table-alignment-mode = none, table-number-alignment=left, table-text-alignment = left} + \caption[Search Solutions of FT-Transformer With Pre-training]{Search solutions of FT-Transformer with pretraining. The three right columns document the best combination in terms of validation accuracy per feature set. We perform \num{10} trials. 
Arrows indicate the change compared to the supervised variant.} \label{tab:solutions-transformer-pretraining} \begin{tabular}{@{}llSSS@{}} \toprule - Hyperparameter & Distribution & {\glsentryshort{FS} Classical} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule - Layers $L$ & $\operatorname{UniformInt}[1,6]$ & 4 & 4 & 4 \\ - Embedding dimension $d_{\mathrm{e}}$ & $\operatorname{UniformInt}[64, 256]$ & 248 & 248 & 248 \\ - Attention dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.04424625102595975 & 0.04424625102595975 & 0.04424625102595975 \\ - \gls{FFN} dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.0979914312095726 & 0.0979914312095726 & 0.0979914312095726 \\ - Learning rate $\eta$ & $\operatorname{LogUniform}[\num{3e-5}, \num{3e-4}]$ & \num{1e-6} & \num{1e-6} & \num{1e-6} \\ - Weight decay $\lambda$ & $\operatorname{LogUniform}[\num{1e-6}, \num{1e-3}]$ & \num{6e-5} & \num{6e-5} & \num{6e-5} \\ \midrule - Validation Accuracy \% & Pre-Train & {\num{95.89540958404541}} & {\num{95.64009308815002}} & {\num{94.06319856643677}} \\ - & Fine-Tune & {$\textcolor{viz-green}{\uparrow}\num{65.13623935106421}$} & {$\textcolor{viz-green}{\uparrow} \num{75.69871634547757}$} & {$\textcolor{viz-green}{\uparrow} \num{77.8904}$} \\ \bottomrule + Hyperparameter & Distribution & {\glsentryshort{FS} Classic} & {\glsentryshort{FS} Size} & {\glsentryshort{FS} Option} \\ \midrule + Layers $L$ & $\operatorname{UniformInt}[1,6]$ & 4 & 4 & 4 \\ + Embedding dimension $d_{\mathrm{e}}$ & $\operatorname{UniformInt}[64, 256]$ & 248 & 248 & 248 \\ + Attention dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.04424625102595975 & 0.04424625102595975 & 0.04424625102595975 \\ + \gls{FFN} dropout & $\operatorname{Uniform}[0, 0.5]$ & 0.0979914312095726 & 0.0979914312095726 & 0.0979914312095726 \\ + Learning rate $\eta$ & $\operatorname{LogUniform}[\num{3e-5}, \num{3e-4}]$ & \num{1e-6} & \num{1e-6} & \num{1e-6} \\ + Weight decay $\lambda$ & $\operatorname{LogUniform}[\num{1e-6}, \num{1e-3}]$ & \num{6e-5} & \num{6e-5} & \num{6e-5} \\ \midrule + Validation Accuracy \% & Pre-Train & {\num{95.89540958404541}} & {\num{95.64009308815002}} & {\num{94.06319856643677}} \\ + & Fine-Tune & {$\textcolor{viz-green}{\uparrow}\num{65.13623935106421}$} & {$\textcolor{viz-green}{\uparrow} \num{75.69871634547757}$} & {$\textcolor{viz-green}{\uparrow} \num{77.8904}$} \\ \bottomrule \end{tabular} \end{table} @@ -333,15 +337,13 @@ \subsubsection{Hyperparameter Tuning}\label{sec:hyperparameter-tuning} Akin to selecting the machine learning classifiers, we select our classical baselines on the \gls{ISE} validation set. This prevents \gls{overfitting} the test set and maintains consistency between both paradigms. For the same reason, baselines are kept constant in the transfer setting on the \gls{CBOE} sample. -Optimizing hybrids of trade classification rules through Bayesian search is experimentally feasible by the stacking paradigm of \cref{sec:rule-based-approaches} and by treating the rules as a tunable hyperparameter. We consider all rules from \cref{sec:rule-based-approaches} learned on adjacent quotes of the exchange and \gls{NBBO} level or adjacent prices at the exchange and inter-exchange level and stack up to six rules. To model simple rules, consisting of a single or few rules, we add an identity mapping, $\operatorname{Id}$, that defers classification to later rules in the stack. 
A caveat of this approach is that sampled combinations may not be economically meaningful e.g., applying depth rule after tick rule, or not effective e.g., quote rule after tick rule, assuming complete data. Despite being unexplored, a conditional search space or human-in-the-loop sampling could account for this. - -After all, we find no outperformance over hybrid rules already reported in the literature, as documented online.\footnote{For \gls{FS} classical our best combination of $\operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ (simplified) reaches a validation accuracy of \SI{58.93934926393819}{\percent} equaling the solution of \textcite[][12]{grauerOptionTradeClassification2022}. For \gls{FS} size/option the best search solution is $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{tick}_{\mathrm{all}}$ (simplified) with \SI{69.03521015523933}{\percent} accuracy. The combination of \textcite[][14]{grauerOptionTradeClassification2022} reaches with $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{nbbo}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ an accuracy of \SI{69.3726}{\percent}. See \url{https://wandb.ai/fbv/thesis/runs/3f2m9c6i} and \url{https://wandb.ai/fbv/thesis/runs/16d6e4dk} for details. Experiments are run with \num{500} trials each.} Our combinations match or trail the accuracies of rules from \textcite[][12--14]{grauerOptionTradeClassification2022} on the \gls{ISE} validation set. Subsequently, we adopt their combinations as our benchmark, considering them to be the most challenging. +Optimizing hybrids of trade classification rules through Bayesian search is experimentally feasible by the stacking paradigm of \cref{sec:stacked-rule} and by treating the rules from \cref{sec:rule-based-approaches} as a tunable hyperparameter. After all, we find no outperformance over hybrid rules already reported in the literature, as documented online.\footnote{For the performance of found combinations see \url{https://wandb.ai/fbv/thesis/runs/3f2m9c6i} and \url{https://wandb.ai/fbv/thesis/runs/16d6e4dk} for details. Experiments are run with \num{500} trials. Performance of rules for literature is documented in \cref{tab:ise-classical}.} Our combinations match or trail the accuracies of rules from \textcite[\checkmark][13--15]{grauerOptionTradeClassification2022} on the \gls{ISE} validation set. Subsequently, we adopt their combinations as our benchmark, considering them to be the most challenging. From all candidate algorithms, a combination of quote and tick rules, $\operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$, where the quote rule first applied to the \gls{NBBO} and then to quotes of the \gls{ISE} followed by the reverse tick rule at inter-exchange level, performs best reaching a validation accuracy of \SI{58.76225138074204}{\percent}. For brevity, we refer to this combination as the \gls{GSU} method (small). It can be estimated using features from feature set one, which qualifies it as a benchmark. -For the second feature set involving size-related rules, we consider rules that involve the trade size or depth rule. 
Consistent with the recommendation of \textcite[][14]{grauerOptionTradeClassification2022}, we find that a deep stack of the $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{nbbo}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ achieves the highest validation accuracy. We refer to this lengthy combination as the \gls{GSU} method (large). Much of the performance gains are owed to the trade size and depth rules, which reduce the dependence on the reverse tick test as a last resort and provide overrides for trades at the quotes, improving validation accuracy to \SI{69.37267458589436}{\percent}. Due to the extended use of the quoted sizes and trade sizes, it is our benchmark for the second feature set. +For the second feature set involving size-related rules, we consider rules that involve the trade size or depth rule. Consistent with the recommendation of \textcite[\checkmark][15]{grauerOptionTradeClassification2022}, a deep stack of the $\operatorname{tsize}_{\mathrm{ex}} \to \operatorname{quote}_{\mathrm{nbbo}} \to \operatorname{quote}_{\mathrm{ex}} \to \operatorname{depth}_{\mathrm{nbbo}} \to \operatorname{depth}_{\mathrm{ex}} \to \operatorname{rtick}_{\mathrm{all}}$ achieves the highest validation accuracy. We refer to this lengthy combination as the \gls{GSU} method (large). Much of the performance gains are owed to the trade size and depth rules, which reduce the dependence on the reverse tick test as a last resort and provide overrides for trades at the quotes, improving validation accuracy to \SI{69.37267458589436}{\percent}. Due to the extended use of the quoted sizes and trade sizes, it is our benchmark for the second feature set. -In the absence of other baselines, we repeatedly compare against the same method as a baseline for the third feature set, even if it doesn't involve option-specific features. +In the absence of other baselines, we repeatedly compare against the same rule as a baseline for the third feature set, even if it doesn't involve option-specific features. -In the direct comparison between the validation accuracies of classical rules and our classifiers, the validation accuracies of classical rules considerably underperform the learned classifier. \cref{sec:results} discusses if the results hold for the test sets. But before we do so, we present the metrics used +When comparing the validation accuracies of classical rules and our classifiers, the validation accuracies of classical rules considerably underperform the learned classifier. \cref{sec:results} discusses if the results hold for the test sets. But before, we present the metrics used for evaluation. 
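For concreteness, the stacking logic of the \gls{GSU} method (small) can be sketched as follows. This is an illustrative reimplementation under assumptions, not the project's code: the column names (TRADE_PRICE, BEST_BID/BEST_ASK for the \gls{NBBO}, bid_ex/ask_ex for exchange quotes, price_all_lead for the next differing inter-exchange price) and the +1 (buy) / -1 (sell) encoding are assumed, and the random fallback mirrors the \SI{50}{\percent} baseline described earlier.

# Minimal sketch of the GSU method (small):
# quote rule (NBBO) -> quote rule (exchange) -> reverse tick rule (all exchanges),
# with random classification for trades none of the rules can sign.
import numpy as np
import pandas as pd


def quote_rule(trades: pd.DataFrame, bid: str, ask: str) -> pd.Series:
    """Buy (+1) if the trade prints above the midpoint, sell (-1) if below, NaN at the mid."""
    mid = (trades[bid] + trades[ask]) / 2
    signs = np.select(
        [trades["TRADE_PRICE"] > mid, trades["TRADE_PRICE"] < mid], [1, -1], np.nan
    )
    return pd.Series(signs, index=trades.index)


def rev_tick_rule(trades: pd.DataFrame, price_lead: str) -> pd.Series:
    """Reverse tick: buy if the next differing price is lower, sell if it is higher."""
    signs = np.select(
        [
            trades["TRADE_PRICE"] > trades[price_lead],
            trades["TRADE_PRICE"] < trades[price_lead],
        ],
        [1, -1],
        np.nan,
    )
    return pd.Series(signs, index=trades.index)


def gsu_small(trades: pd.DataFrame, seed: int = 42) -> pd.Series:
    """Stack the rules; later rules only fill what earlier rules left unclassified."""
    pred = quote_rule(trades, "BEST_BID", "BEST_ASK")            # NBBO quotes
    pred = pred.fillna(quote_rule(trades, "bid_ex", "ask_ex"))   # exchange quotes
    pred = pred.fillna(rev_tick_rule(trades, "price_all_lead"))  # inter-exchange prices
    rng = np.random.default_rng(seed)
    fallback = pd.Series(rng.choice([-1, 1], size=len(pred)), index=pred.index)
    return pred.fillna(fallback).astype(int)                     # random 50% fallback

The same pattern extends to the \gls{GSU} method (large) by prepending a trade size rule and inserting the depth rules before the reverse tick test.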
\ No newline at end of file diff --git a/reports/Graphs/1gzk7msy-hyperparam-search-space.pdf b/reports/Graphs/1gzk7msy-hyperparam-search-space.pdf index 4af6936b..8acc6065 100644 Binary files a/reports/Graphs/1gzk7msy-hyperparam-search-space.pdf and b/reports/Graphs/1gzk7msy-hyperparam-search-space.pdf differ diff --git a/reports/Graphs/1qx3ul4j-hyperparam-search-space.pdf b/reports/Graphs/1qx3ul4j-hyperparam-search-space.pdf index 8ca2ccc1..e30b3884 100644 Binary files a/reports/Graphs/1qx3ul4j-hyperparam-search-space.pdf and b/reports/Graphs/1qx3ul4j-hyperparam-search-space.pdf differ diff --git a/reports/Graphs/2h81aiow-hyperparam-search-space.pdf b/reports/Graphs/2h81aiow-hyperparam-search-space.pdf index 9284d35c..a3f65aa2 100644 Binary files a/reports/Graphs/2h81aiow-hyperparam-search-space.pdf and b/reports/Graphs/2h81aiow-hyperparam-search-space.pdf differ diff --git a/reports/Graphs/2t5zo50f-hyperparam-search-space.pdf b/reports/Graphs/2t5zo50f-hyperparam-search-space.pdf index 47a995e0..3ef629be 100644 Binary files a/reports/Graphs/2t5zo50f-hyperparam-search-space.pdf and b/reports/Graphs/2t5zo50f-hyperparam-search-space.pdf differ diff --git a/reports/Graphs/3jpe46s1-hyperparam-search-space.pdf b/reports/Graphs/3jpe46s1-hyperparam-search-space.pdf index 9afabff7..60571fa8 100644 Binary files a/reports/Graphs/3jpe46s1-hyperparam-search-space.pdf and b/reports/Graphs/3jpe46s1-hyperparam-search-space.pdf differ diff --git a/reports/Graphs/3vntumoi-hyperparam-search-space.pdf b/reports/Graphs/3vntumoi-hyperparam-search-space.pdf index f4d5a903..abb8639e 100644 Binary files a/reports/Graphs/3vntumoi-hyperparam-search-space.pdf and b/reports/Graphs/3vntumoi-hyperparam-search-space.pdf differ diff --git a/reports/Graphs/classical_at_mid_over_time.pdf b/reports/Graphs/classical_at_mid_over_time.pdf index aac974d4..8634823c 100644 Binary files a/reports/Graphs/classical_at_mid_over_time.pdf and b/reports/Graphs/classical_at_mid_over_time.pdf differ diff --git a/reports/thesis.tex b/reports/thesis.tex index 37a215dd..1609e7cb 100644 --- a/reports/thesis.tex +++ b/reports/thesis.tex @@ -10,6 +10,8 @@ \DeclareLanguageMapping{british}{british-apa} % \renewcommand{\thepage}{} % Removes the page number \addbibresource{Content/bibliography.bib} +\usepackage{xurl} % url breaks in biblatex +% https://tex.stackexchange.com/questions/567789/adding-line-break-to-bibliography-with-long-numerical-doi % Format and layout \usepackage[left=3cm,right=3cm,bottom=3cm]{geometry} % Specifies left and right side margins. @@ -49,7 +51,7 @@ width=0.2cm,minimum height=0.2cm,inner sep=0pt] at (0,0) {};}} \newcommand{\bestcircle}{\tikz{\node[circle,draw=darkgray, fill=white, line width=0.5pt, minimum width=0.2cm,minimum height=0.2cm, inner sep=0pt, draw opacity=.2] at (0,0){};}} -\newcommand{\myline}{\tikz{\draw[dashed, gray, line width=0.5pt] (0,0) -- (0,0.3);}} +\newcommand{\myline}{\tikz{\draw[dashed, gray, line width=1pt] (0,0) -- (0,0.3);}} \usepackage{enumitem} % enumerate with letters https://tex.stackexchange.com/a/129960 @@ -120,7 +122,7 @@ % Please fill in this information once at the beginning. This way, gaps will be filled in automatically in the following. \newcommand{\name}{Markus Bilz} % Enter your name. \newcommand{\dateofthesis}{15 July 2023} % Enter the submission date of your thesis. -\newcommand{\titleofthesis}{Learn the Rules: Improving Option Trade Classification With Machine Learning} % Enter the title of your thesis. 
+\newcommand{\titleofthesis}{Improving Option Trade Classification With Machine Learning} % Enter the title of your thesis. \newcommand{\streetadress}{Mathystr.~14-16 // XI-11} % Enter your street address. \newcommand{\postalcode}{76133} % Enter your postal code. \newcommand{\city}{Karlsruhe} % Enter your city/town. @@ -212,7 +214,7 @@ \newacronym{OTM}{OTM}{out-of-the-money} \newacronym{RMSE}{RMSE}{root mean squared error} \newacronym{RF}{RF}{random forest} -\newacronym{ReLU}{ReLU}{Rectified Linear Units} +\newacronym{ReLU}{ReLU}{Rectified Linear Unit} \newacronym{SAGE}{SAGE}{Shapley Additive Global importancE} \newacronym{SSE}{SSE}{sum of squared errors} \newacronym{SHAP}{SHAP}{SHapley Additive exPlanations} diff --git a/src/otc/utils/dups.pl b/src/otc/utils/dups.pl new file mode 100644 index 00000000..1df98816 --- /dev/null +++ b/src/otc/utils/dups.pl @@ -0,0 +1,55 @@ +#!/usr/bin/env perl + +# Finds duplicate adjacent words. +# From: https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/ + +use strict ; + +my $DupCount = 0 ; + +if (!@ARGV) { + print "usage: dups ...\n" ; + exit ; +} + +while (1) { + my $FileName = shift @ARGV ; + + # Exit code = number of duplicates found. + exit $DupCount if (!$FileName) ; + + open FILE, $FileName or die $!; + + my $LastWord = "" ; + my $LineNum = 0 ; + + while () { + chomp ; + + $LineNum ++ ; + + my @words = split (/(\W+)/) ; + + foreach my $word (@words) { + # Skip spaces: + next if $word =~ /^\s*$/ ; + + # Skip punctuation: + if ($word =~ /^\W+$/) { + $LastWord = "" ; + next ; + } + + # Found a dup? + if (lc($word) eq lc($LastWord)) { + print "$FileName:$LineNum $word\n" ; + $DupCount ++ ; + } # Thanks to Sean Cronin for tip on case. + + # Mark this as the last word: + $LastWord = $word ; + } + } + + close FILE ; +} diff --git a/src/otc/utils/passive.sh b/src/otc/utils/passive.sh new file mode 100644 index 00000000..7ae40ac3 --- /dev/null +++ b/src/otc/utils/passive.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# from https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/ +irregulars="awoken|\ +been|born|beat|\ +become|begun|bent|\ +beset|bet|bid|\ +bidden|bound|bitten|\ +bled|blown|broken|\ +bred|brought|broadcast|\ +built|burnt|burst|\ +bought|cast|caught|\ +chosen|clung|come|\ +cost|crept|cut|\ +dealt|dug|dived|\ +done|drawn|dreamt|\ +driven|drunk|eaten|fallen|\ +fed|felt|fought|found|\ +fit|fled|flung|flown|\ +forbidden|forgotten|\ +foregone|forgiven|\ +forsaken|frozen|\ +gotten|given|gone|\ +ground|grown|hung|\ +heard|hidden|hit|\ +held|hurt|kept|knelt|\ +knit|known|laid|led|\ +leapt|learnt|left|\ +lent|let|lain|lighted|\ +lost|made|meant|met|\ +misspelt|mistaken|mown|\ +overcome|overdone|overtaken|\ +overthrown|paid|pled|proven|\ +put|quit|read|rid|ridden|\ +rung|risen|run|sawn|said|\ +seen|sought|sold|sent|\ +set|sewn|shaken|shaven|\ +shorn|shed|shone|shod|\ +shot|shown|shrunk|shut|\ +sung|sunk|sat|slept|\ +slain|slid|slung|slit|\ +smitten|sown|spoken|sped|\ +spent|spilt|spun|spit|\ +split|spread|sprung|stood|\ +stolen|stuck|stung|stunk|\ +stridden|struck|strung|\ +striven|sworn|swept|\ +swollen|swum|swung|taken|\ +taught|torn|told|thought|\ +thrived|thrown|thrust|\ +trodden|understood|upheld|\ +upset|woken|worn|woven|\ +wed|wept|wound|won|\ +withheld|withstood|wrung|\ +written" + +if [ "$1" = "" ]; then + echo "usage: `basename $0` ..." + exit +fi + +egrep -n -i --color \ + "\\b(am|are|were|being|is|been|was|be)\ +\\b[ ]*(\w+ed|($irregulars))\\b" $* + +exit $? 
diff --git a/src/otc/utils/weasel.sh b/src/otc/utils/weasel.sh new file mode 100644 index 00000000..7bff4358 --- /dev/null +++ b/src/otc/utils/weasel.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# from https://matt.might.net/articles/shell-scripts-for-passive-voice-weasel-words-duplicates/ +weasels="many|various|very|fairly|several|extremely\ +|exceedingly|quite|remarkably|few|surprisingly\ +|mostly|largely|huge|tiny|((are|is) a number)\ +|excellent|interestingly|significantly\ +|substantially|clearly|vast|relatively|completely" + +wordfile="" + +# Check for an alternate weasel file +if [ -f $HOME/etc/words/weasels ]; then + wordfile="$HOME/etc/words/weasels" +fi + +if [ -f $WORDSDIR/weasels ]; then + wordfile="$WORDSDIR/weasels" +fi + +if [ -f words/weasels ]; then + wordfile="words/weasels" +fi + +if [ ! "$wordfile" = "" ]; then + weasels="xyzabc123"; + for w in `cat $wordfile`; do + weasels="$weasels|$w" + done +fi + + +if [ "$1" = "" ]; then + echo "usage: `basename $0` ..." + exit +fi + +egrep -i -n --color "\\b($weasels)\\b" $* + +exit $?