Commit 2fbb4d9

Clean up thesis 🫁 (#422)

KarelZe authored Jul 9, 2023
1 parent d9d57ea commit 2fbb4d9
Showing 63 changed files with 1,180 additions and 1,600 deletions.
14 changes: 7 additions & 7 deletions notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb
Expand Up @@ -18,6 +18,8 @@
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import gcsfs\n",
"import google.auth\n",
"import modin.config as cfg\n",
@@ -28,8 +30,6 @@
"cfg.Engine.put(\"dask\")\n",
"ProgressBar.enable()\n",
"\n",
"import os\n",
"\n",
"import wandb\n",
"from tqdm.auto import tqdm\n"
]
@@ -91,7 +91,7 @@
" \"\"\"\n",
" create a dataframe and optimize its memory usage.\n",
"\n",
" I. e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n",
" I.e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n",
" of unique columns and chunking to enable import.\n",
"\n",
" Adapted from here:\n",
@@ -201,19 +201,19 @@
"outputs": [],
"source": [
"def df_to_parquet(\n",
" df: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n",
" x: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n",
") -> None:\n",
" \"\"\"\n",
" Write pd.DataFrame to parquet format.\n",
"\n",
" Args:\n",
" df (pd.DataFrame): input dataframe.\n",
" x (pd.DataFrame): input dataframe.\n",
" target_dir (str): local directory where parquet files are written to.\n",
" chunk_size (int, optional): number of rows stored in one chunk of parquet file.\n",
" Defaults to 1000000.\n",
" \"\"\"\n",
" for i in tqdm(range(0, len(df), chunk_size)):\n",
" slc = df.iloc[i : i + chunk_size]\n",
" for i in tqdm(range(0, len(x), chunk_size)):\n",
" slc = x.iloc[i : i + chunk_size]\n",
" chunk = int(i / chunk_size)\n",
" output_path = (\n",
" target_dir\n",
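For reference, a self-contained sketch of the chunked-write pattern df_to_parquet implements; the loop body's path construction is truncated in this diff, so the file naming below is an assumption:

from pathlib import Path

import pandas as pd  # .to_parquet needs pyarrow or fastparquet installed


def write_chunked(x: pd.DataFrame, target_dir: str, chunk_size: int = 1_000_000) -> None:
    """Write x as one parquet file per chunk_size rows."""
    Path(target_dir).mkdir(parents=True, exist_ok=True)
    for i in range(0, len(x), chunk_size):
        chunk = i // chunk_size
        # Slice a window of rows and write it to its own file.
        x.iloc[i : i + chunk_size].to_parquet(Path(target_dir) / f"chunk_{chunk}.parquet")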
25 changes: 1 addition & 24 deletions notebooks/2.0a-mb-data-preprocessing-supervised.ipynb
@@ -19,8 +19,6 @@
"from catboost import CatBoostClassifier, Pool\n",
"from numpy.testing import assert_almost_equal\n",
"from pandas._testing.asserters import assert_almost_equal\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.model_selection import cross_val_predict\n",
"from tqdm.auto import tqdm\n",
"\n",
"sys.path.append(\"..\")\n"
@@ -104,9 +102,6 @@
" for i in range(0, max_i)\n",
"]\n",
"\n",
"# asks = [f\"ASK_{i}\" for i in range(1, 17)]\n",
"# bids = [f\"BID_{i}\" for i in range(1, 17)]\n",
"\n",
"columns = [\n",
" \"QUOTE_DATETIME\",\n",
" \"ROOT\",\n",
@@ -129,8 +124,6 @@
" \"price_ex_lag\",\n",
" \"issue_type\",\n",
" \"myn\",\n",
" # *asks,\n",
" # *bids,\n",
" \"buy_sell\",\n",
"]\n",
"\n",
@@ -156,23 +149,6 @@
"df.memory_usage(deep=True).sum()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 739
},
"id": "gHnFz65rZnOZ",
"outputId": "e5d3b5d7-3e01-4483-ba82-97793c5ebc68",
"tags": []
},
"outputs": [],
"source": [
"df.head().T\n"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -268,6 +244,7 @@
"outputs": [],
"source": [
"# indices\n",
"train_range, val_range, test_range = None, None, None\n",
"\n",
"if EXCHANGE == \"ise\" and STRATEGY == \"supervised\":\n",
" train_range = df.QUOTE_DATETIME.between(\n",
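The added initialization above guards against unbound names when no exchange/strategy branch matches; the branches then assign boolean masks via QUOTE_DATETIME.between. A toy illustration of that date-based splitting pattern (dates and data are invented):

import pandas as pd

df = pd.DataFrame(
    {
        "QUOTE_DATETIME": pd.to_datetime(["2006-01-02", "2011-06-01", "2013-05-15"]),
        "buy_sell": [1, -1, 1],
    }
)

train_range = df.QUOTE_DATETIME.between("2005-05-02", "2010-12-31")
val_range = df.QUOTE_DATETIME.between("2011-01-01", "2012-12-31")
test_range = df.QUOTE_DATETIME.between("2013-01-01", "2013-10-24")

train, val, test = df[train_range], df[val_range], df[test_range]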
82 changes: 12 additions & 70 deletions notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb
@@ -12,7 +12,6 @@
"import os\n",
"\n",
"import gcsfs\n",
"import numpy as np\n",
"import pandas as pd\n",
"import wandb\n",
"\n",
@@ -161,7 +160,7 @@
},
"outputs": [],
"source": [
"df.head()"
"df.head()\n"
]
},
{
@@ -187,7 +186,7 @@
},
"outputs": [],
"source": [
"labelled_df.head()"
"labelled_df.head()\n"
]
},
{
@@ -227,7 +226,7 @@
" \"bid_ex\",\n",
" \"bid_size_ex\",\n",
" \"ask_size_ex\",\n",
" # FIXME is different for loballed and unlabelled trades\n",
" # Can be different for lobelled and unlabelled trades:\n",
" # 'optionid','issue_type', 'myn',\n",
" # 'price_all_lead', 'price_all_lag',\n",
" # 'price_ex_lead', 'price_ex_lag',\n",
@@ -266,7 +265,7 @@
},
"outputs": [],
"source": [
"labelled_df['index_labelled'] = labelled_df.index"
"labelled_df[\"index_labelled\"] = labelled_df.index\n"
]
},
{
@@ -307,10 +306,10 @@
" \"bid_ex\",\n",
" \"bid_size_ex\",\n",
" \"ask_size_ex\",\n",
" # myn seems to be different\n",
" #'issue_type', 'optionid',\n",
" # 'price_all_lead', 'price_all_lag', # FIXME is different for loballed and unlabelled trades\n",
" # 'price_ex_lead', 'price_ex_lag', # FIXME is different for loballed and unlabelled trades\n",
" # myn seems to be different for labelled and unlabelled trades\n",
" # 'issue_type', 'optionid',\n",
" # 'price_all_lead', 'price_all_lag',\n",
" # 'price_ex_lead', 'price_ex_lag',\n",
" ],\n",
" how=\"left\",\n",
" indicator=\"exists\",\n",
@@ -326,7 +325,7 @@
},
"outputs": [],
"source": [
"df_w_indicator.head(50)"
"df_w_indicator.head(50)\n"
]
},
{
@@ -339,7 +338,7 @@
"source": [
"# interpolate missing indices. index increases 1 -> 2. So filling with float seems ok. will be inserted between int of labelled df.\n",
"df_w_indicator[\"index_labelled\"].interpolate(\"linear\", inplace=True)\n",
"df_w_indicator.set_index(keys = \"index_labelled\", drop=True, inplace=True)"
"df_w_indicator.set_index(keys=\"index_labelled\", drop=True, inplace=True)\n"
]
},
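The linear interpolation gives unmatched rows float indices that fall between the integer indices of the neighbouring matched rows, so the merged frame can be re-indexed consistently. A toy illustration:

import pandas as pd

s = pd.Series([0.0, None, None, 1.0, 2.0], name="index_labelled")
# Two unmatched rows between labelled indices 0 and 1 receive 1/3 and 2/3.
print(s.interpolate("linear").tolist())  # [0.0, 0.333..., 0.666..., 1.0, 2.0]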
{
@@ -409,63 +408,6 @@
"df_w_indicator[df_w_indicator[\"exists\"] == \"both\"].head(20).T\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analysis of Accucacies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# foo = df_w_indicator[df_w_indicator[\"exists\"] == \"both\"][\n",
"# [\"price_ex_lag_labelled\", \"price_ex_lag_unlabelled\", \"buy_sell\", \"TRADE_PRICE\"]\n",
"# ]\n",
"\n",
"# foo[\"tick_unlabelled\"] = np.where(\n",
"# foo[\"TRADE_PRICE\"] > foo[\"price_ex_lag_unlabelled\"],\n",
"# 1,\n",
"# np.where(foo[\"TRADE_PRICE\"] < foo[\"price_ex_lag_unlabelled\"], -1, np.nan),\n",
"# )\n",
"# foo[\"tick_labelled\"] = np.where(\n",
"# foo[\"TRADE_PRICE\"] > foo[\"price_ex_lag_labelled\"],\n",
"# 1,\n",
"# np.where(foo[\"TRADE_PRICE\"] < foo[\"price_ex_lag_labelled\"], -1, np.nan),\n",
"# )\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# foo.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# acc_unlabelled = (foo.buy_sell == foo.tick_unlabelled).sum() / len(foo)\n",
"# acc_labelled = (foo.buy_sell == foo.tick_labelled).sum() / len(foo)\n",
"\n",
"# print(acc_unlabelled)\n",
"# print(acc_labelled)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -481,10 +423,10 @@
"source": [
"# use last 6 months. May increase later\n",
"date_range = df_w_indicator.QUOTE_DATETIME.between(\n",
" \"2013-04-24 00:00:00\", \"2013-10-24 16:14:48\"\n",
" \"2013-04-24 00:00:00\", \"2013-10-24 16:14:48\"\n",
")\n",
"\n",
"df_w_indicator = df_w_indicator[date_range]"
"df_w_indicator = df_w_indicator[date_range]\n"
]
},
{
6 changes: 3 additions & 3 deletions notebooks/3.0a-mb-explanatory-data-analysis.ipynb
@@ -8,6 +8,8 @@
},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import os\n",
"import random\n",
"import warnings\n",
@@ -16,9 +18,7 @@
"\n",
"warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n",
"\n",
"from __future__ import annotations\n",
"\n",
"from typing import Any, List, Tuple\n",
"from typing import Any, List\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
13 changes: 3 additions & 10 deletions notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb
@@ -7,32 +7,25 @@
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import wandb\n",
"import sys\n",
"import warnings\n",
"from pathlib import Path\n",
"import optuna\n",
"import pandas as pd\n",
"\n",
"import os\n",
"import sys\n",
"\n",
"from otc.features.build_features import (\n",
" features_categorical,\n",
" features_classical,\n",
" features_classical_size,\n",
" features_ml,\n",
")\n",
"\n",
"from __future__ import annotations\n",
"\n",
"from typing import Any, List, Tuple\n",
"from typing import List\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from scipy import stats\n",
"\n",
"from tqdm.auto import tqdm"
]
Expand Down
