Commit 2fbb4d9

Clean up thesis 🫁 (#422)

KarelZe authored Jul 9, 2023
1 parent d9d57ea commit 2fbb4d9
Showing 63 changed files with 1,180 additions and 1,600 deletions.
14 changes: 7 additions & 7 deletions notebooks/1.0-mb-data-preprocessing-mem-reduce.ipynb
Expand Up @@ -18,6 +18,8 @@
},
"outputs": [],
"source": [
"import os\n",
"\n",
"import gcsfs\n",
"import google.auth\n",
"import modin.config as cfg\n",
@@ -28,8 +30,6 @@
"cfg.Engine.put(\"dask\")\n",
"ProgressBar.enable()\n",
"\n",
"import os\n",
"\n",
"import wandb\n",
"from tqdm.auto import tqdm\n"
]
@@ -91,7 +91,7 @@
" \"\"\"\n",
" create a dataframe and optimize its memory usage.\n",
"\n",
" I. e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n",
" I.e., apply some optimizations i.e, manual inference of dtypes, pre-selection\n",
" of unique columns and chunking to enable import.\n",
"\n",
" Adapted from here:\n",
@@ -201,19 +201,19 @@
"outputs": [],
"source": [
"def df_to_parquet(\n",
" df: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n",
" x: pd.DataFrame, target_dir: str, chunk_size: int = 1000000, **parquet_wargs\n",
") -> None:\n",
" \"\"\"\n",
" Write pd.DataFrame to parquet format.\n",
"\n",
" Args:\n",
" df (pd.DataFrame): input dataframe.\n",
" x (pd.DataFrame): input dataframe.\n",
" target_dir (str): local directory where parquet files are written to.\n",
" chunk_size (int, optional): number of rows stored in one chunk of parquet file.\n",
" Defaults to 1000000.\n",
" \"\"\"\n",
" for i in tqdm(range(0, len(df), chunk_size)):\n",
" slc = df.iloc[i : i + chunk_size]\n",
" for i in tqdm(range(0, len(x), chunk_size)):\n",
" slc = x.iloc[i : i + chunk_size]\n",
" chunk = int(i / chunk_size)\n",
" output_path = (\n",
" target_dir\n",
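For reference, a self-contained sketch of the chunked-write pattern df_to_parquet implements; the loop body's path construction is truncated in this diff, so the file naming below is an assumption:

from pathlib import Path

import pandas as pd  # .to_parquet needs pyarrow or fastparquet installed


def write_chunked(x: pd.DataFrame, target_dir: str, chunk_size: int = 1_000_000) -> None:
    """Write x as one parquet file per chunk_size rows."""
    Path(target_dir).mkdir(parents=True, exist_ok=True)
    for i in range(0, len(x), chunk_size):
        chunk = i // chunk_size
        # Slice a window of rows and write it to its own file.
        x.iloc[i : i + chunk_size].to_parquet(Path(target_dir) / f"chunk_{chunk}.parquet")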
25 changes: 1 addition & 24 deletions notebooks/2.0a-mb-data-preprocessing-supervised.ipynb
@@ -19,8 +19,6 @@
"from catboost import CatBoostClassifier, Pool\n",
"from numpy.testing import assert_almost_equal\n",
"from pandas._testing.asserters import assert_almost_equal\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.model_selection import cross_val_predict\n",
"from tqdm.auto import tqdm\n",
"\n",
"sys.path.append(\"..\")\n"
@@ -104,9 +102,6 @@
" for i in range(0, max_i)\n",
"]\n",
"\n",
"# asks = [f\"ASK_{i}\" for i in range(1, 17)]\n",
"# bids = [f\"BID_{i}\" for i in range(1, 17)]\n",
"\n",
"columns = [\n",
" \"QUOTE_DATETIME\",\n",
" \"ROOT\",\n",
@@ -129,8 +124,6 @@
" \"price_ex_lag\",\n",
" \"issue_type\",\n",
" \"myn\",\n",
" # *asks,\n",
" # *bids,\n",
" \"buy_sell\",\n",
"]\n",
"\n",
@@ -156,23 +149,6 @@
"df.memory_usage(deep=True).sum()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 739
},
"id": "gHnFz65rZnOZ",
"outputId": "e5d3b5d7-3e01-4483-ba82-97793c5ebc68",
"tags": []
},
"outputs": [],
"source": [
"df.head().T\n"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -268,6 +244,7 @@
"outputs": [],
"source": [
"# indices\n",
"train_range, val_range, test_range = None, None, None\n",
"\n",
"if EXCHANGE == \"ise\" and STRATEGY == \"supervised\":\n",
" train_range = df.QUOTE_DATETIME.between(\n",
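The added initialization above guards against unbound names when no exchange/strategy branch matches; the branches then assign boolean masks via QUOTE_DATETIME.between. A toy illustration of that date-based splitting pattern (dates and data are invented):

import pandas as pd

df = pd.DataFrame(
    {
        "QUOTE_DATETIME": pd.to_datetime(["2006-01-02", "2011-06-01", "2013-05-15"]),
        "buy_sell": [1, -1, 1],
    }
)

train_range = df.QUOTE_DATETIME.between("2005-05-02", "2010-12-31")
val_range = df.QUOTE_DATETIME.between("2011-01-01", "2012-12-31")
test_range = df.QUOTE_DATETIME.between("2013-01-01", "2013-10-24")

train, val, test = df[train_range], df[val_range], df[test_range]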
82 changes: 12 additions & 70 deletions notebooks/2.0b-mb-data-preprocessing-unsupervised.ipynb
@@ -12,7 +12,6 @@
"import os\n",
"\n",
"import gcsfs\n",
"import numpy as np\n",
"import pandas as pd\n",
"import wandb\n",
"\n",
@@ -161,7 +160,7 @@
},
"outputs": [],
"source": [
"df.head()"
"df.head()\n"
]
},
{
@@ -187,7 +186,7 @@
},
"outputs": [],
"source": [
"labelled_df.head()"
"labelled_df.head()\n"
]
},
{
@@ -227,7 +226,7 @@
" \"bid_ex\",\n",
" \"bid_size_ex\",\n",
" \"ask_size_ex\",\n",
" # FIXME is different for loballed and unlabelled trades\n",
" # Can be different for lobelled and unlabelled trades:\n",
" # 'optionid','issue_type', 'myn',\n",
" # 'price_all_lead', 'price_all_lag',\n",
" # 'price_ex_lead', 'price_ex_lag',\n",
@@ -266,7 +265,7 @@
},
"outputs": [],
"source": [
"labelled_df['index_labelled'] = labelled_df.index"
"labelled_df[\"index_labelled\"] = labelled_df.index\n"
]
},
{
@@ -307,10 +306,10 @@
" \"bid_ex\",\n",
" \"bid_size_ex\",\n",
" \"ask_size_ex\",\n",
" # myn seems to be different\n",
" #'issue_type', 'optionid',\n",
" # 'price_all_lead', 'price_all_lag', # FIXME is different for loballed and unlabelled trades\n",
" # 'price_ex_lead', 'price_ex_lag', # FIXME is different for loballed and unlabelled trades\n",
" # myn seems to be different for labelled and unlabelled trades\n",
" # 'issue_type', 'optionid',\n",
" # 'price_all_lead', 'price_all_lag',\n",
" # 'price_ex_lead', 'price_ex_lag',\n",
" ],\n",
" how=\"left\",\n",
" indicator=\"exists\",\n",
@@ -326,7 +325,7 @@
},
"outputs": [],
"source": [
"df_w_indicator.head(50)"
"df_w_indicator.head(50)\n"
]
},
{
@@ -339,7 +338,7 @@
"source": [
"# interpolate missing indices. index increases 1 -> 2. So filling with float seems ok. will be inserted between int of labelled df.\n",
"df_w_indicator[\"index_labelled\"].interpolate(\"linear\", inplace=True)\n",
"df_w_indicator.set_index(keys = \"index_labelled\", drop=True, inplace=True)"
"df_w_indicator.set_index(keys=\"index_labelled\", drop=True, inplace=True)\n"
]
},
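The linear interpolation gives unmatched rows float indices that fall between the integer indices of the neighbouring matched rows, so the merged frame can be re-indexed consistently. A toy illustration:

import pandas as pd

s = pd.Series([0.0, None, None, 1.0, 2.0], name="index_labelled")
# Two unmatched rows between labelled indices 0 and 1 receive 1/3 and 2/3.
print(s.interpolate("linear").tolist())  # [0.0, 0.333..., 0.666..., 1.0, 2.0]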
{
@@ -409,63 +408,6 @@
"df_w_indicator[df_w_indicator[\"exists\"] == \"both\"].head(20).T\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Analysis of Accucacies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# foo = df_w_indicator[df_w_indicator[\"exists\"] == \"both\"][\n",
"# [\"price_ex_lag_labelled\", \"price_ex_lag_unlabelled\", \"buy_sell\", \"TRADE_PRICE\"]\n",
"# ]\n",
"\n",
"# foo[\"tick_unlabelled\"] = np.where(\n",
"# foo[\"TRADE_PRICE\"] > foo[\"price_ex_lag_unlabelled\"],\n",
"# 1,\n",
"# np.where(foo[\"TRADE_PRICE\"] < foo[\"price_ex_lag_unlabelled\"], -1, np.nan),\n",
"# )\n",
"# foo[\"tick_labelled\"] = np.where(\n",
"# foo[\"TRADE_PRICE\"] > foo[\"price_ex_lag_labelled\"],\n",
"# 1,\n",
"# np.where(foo[\"TRADE_PRICE\"] < foo[\"price_ex_lag_labelled\"], -1, np.nan),\n",
"# )\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# foo.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# acc_unlabelled = (foo.buy_sell == foo.tick_unlabelled).sum() / len(foo)\n",
"# acc_labelled = (foo.buy_sell == foo.tick_labelled).sum() / len(foo)\n",
"\n",
"# print(acc_unlabelled)\n",
"# print(acc_labelled)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -481,10 +423,10 @@
"source": [
"# use last 6 months. May increase later\n",
"date_range = df_w_indicator.QUOTE_DATETIME.between(\n",
" \"2013-04-24 00:00:00\", \"2013-10-24 16:14:48\"\n",
" \"2013-04-24 00:00:00\", \"2013-10-24 16:14:48\"\n",
")\n",
"\n",
"df_w_indicator = df_w_indicator[date_range]"
"df_w_indicator = df_w_indicator[date_range]\n"
]
},
{
6 changes: 3 additions & 3 deletions notebooks/3.0a-mb-explanatory-data-analysis.ipynb
@@ -8,6 +8,8 @@
},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import os\n",
"import random\n",
"import warnings\n",
@@ -16,9 +18,7 @@
"\n",
"warnings.simplefilter(action=\"ignore\", category=FutureWarning)\n",
"\n",
"from __future__ import annotations\n",
"\n",
"from typing import Any, List, Tuple\n",
"from typing import Any, List\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
13 changes: 3 additions & 10 deletions notebooks/3.0b-mb-explanatory-matched-unmatched.ipynb
@@ -7,32 +7,25 @@
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"import wandb\n",
"import sys\n",
"import warnings\n",
"from pathlib import Path\n",
"import optuna\n",
"import pandas as pd\n",
"\n",
"import os\n",
"import sys\n",
"\n",
"from otc.features.build_features import (\n",
" features_categorical,\n",
" features_classical,\n",
" features_classical_size,\n",
" features_ml,\n",
")\n",
"\n",
"from __future__ import annotations\n",
"\n",
"from typing import Any, List, Tuple\n",
"from typing import List\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from scipy import stats\n",
"\n",
"from tqdm.auto import tqdm"
]
Expand Down
