diff --git a/notebooks/4.0a-mb-classical-rules.ipynb b/notebooks/4.0a-mb-classical-rules.ipynb
index ac8b6328..d9ace9e5 100644
--- a/notebooks/4.0a-mb-classical-rules.ipynb
+++ b/notebooks/4.0a-mb-classical-rules.ipynb
@@ -2312,7 +2312,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.4 (tags/v3.9.4:1f2e308, Apr 6 2021, 13:40:21) [MSC v.1928 64 bit (AMD64)]"
+ "version": "3.9.4"
},
"vscode": {
"interpreter": {
diff --git a/notebooks/4.0d-feature-importances.ipynb b/notebooks/4.0d-mb-feature-importances.ipynb
similarity index 100%
rename from notebooks/4.0d-feature-importances.ipynb
rename to notebooks/4.0d-mb-feature-importances.ipynb
diff --git a/notebooks/4.0e-mb-effective-spread.ipynb b/notebooks/4.0e-mb-effective-spread.ipynb
new file mode 100644
index 00000000..e085b21e
--- /dev/null
+++ b/notebooks/4.0e-mb-effective-spread.ipynb
@@ -0,0 +1,133 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import numpy.typing as npt\n",
+ "from sklearn.metrics import make_scorer\n",
+ "from sklearn.utils import check_consistent_length"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def effective_spread(\n",
+ " y_pred: npt.NDArray, trade_price: npt.NDArray, fundamental_value: npt.NDArray\n",
+ ") -> np.float64:\n",
+ " \"\"\"\n",
+ " Calculate the effective spread given by:\n",
+ " $$\n",
+ " S_{i,t} = 2 (P_{i,t} - V_{i,t}) D_{i,t}\n",
+ " $$\n",
+ "\n",
+ " Args:\n",
+ " y_pred (npt.NDArray): indicator if the trade is a buy or sell\n",
+ " trade_price (npt.NDArray): trade price\n",
+ " fundamental_value (npt.NDArray): fundamental value e. g., bid-ask midpoint.\n",
+ " Returns:\n",
+ " float: average effective spread\n",
+ " \"\"\"\n",
+ " check_consistent_length(y_pred, trade_price, fundamental_value)\n",
+ " return np.mean(2 * (trade_price - fundamental_value) * y_pred)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_pred = np.random.choice([-1,1], size=(10))\n",
+ "trade_price = np.random.rand(10) * 100\n",
+ "fundamental_value = np.random.rand(10) * 100\n",
+ "\n",
+ "eff_sp = effective_spread(y_pred, trade_price, fundamental_value)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([-83.57555436, -94.25680187, 92.10142797, 41.24263376,\n",
+ " 168.57843754, 94.69759222, 39.67382461, 81.81819241,\n",
+ " 135.25950003, -79.62206844])"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "2* (trade_price - fundamental_value) * y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "39.591718386351225"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "eff_sp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "score = make_scorer(effective_spread, greater_is_better=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.4"
+ },
+ "orig_nbformat": 4,
+ "vscode": {
+ "interpreter": {
+ "hash": "f8ea8b642289b706932f10b33ee389827410dbaef0ce2c5bf73615e8d3267d88"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/5.0-mb-accelerate-tabtransformer-Copy1.ipynb b/notebooks/5.0-mb-accelerate-tabtransformer.ipynb
similarity index 100%
rename from notebooks/5.0-mb-accelerate-tabtransformer-Copy1.ipynb
rename to notebooks/5.0-mb-accelerate-tabtransformer.ipynb
diff --git a/notebooks/06-mb-visualization-positional-encoding.ipynb b/notebooks/6.0-mb-visualization-positional-encoding.ipynb
similarity index 100%
rename from notebooks/06-mb-visualization-positional-encoding.ipynb
rename to notebooks/6.0-mb-visualization-positional-encoding.ipynb
diff --git a/references/obsidian/.obsidian/workspace.json b/references/obsidian/.obsidian/workspace.json
index cb91d6b6..ba58fb81 100644
--- a/references/obsidian/.obsidian/workspace.json
+++ b/references/obsidian/.obsidian/workspace.json
@@ -6,6 +6,7 @@
{
"id": "aeb1f3ebc9e06223",
"type": "tabs",
+ "dimension": 55.5595798623687,
"children": [
{
"id": "f35a250acbe1a2dc",
@@ -22,6 +23,7 @@
{
"id": "90bbdd3c2ee01632",
"type": "tabs",
+ "dimension": 44.44042013763129,
"children": [
{
"id": "dad26e0868f82374",
@@ -36,18 +38,6 @@
},
"pinned": true
}
- },
- {
- "id": "00542bf949f61a9b",
- "type": "leaf",
- "state": {
- "type": "markdown",
- "state": {
- "file": "📖chapters/🚏Exploratory Data Analysis.md",
- "mode": "source",
- "source": false
- }
- }
}
]
}
@@ -107,6 +97,7 @@
"state": {
"type": "backlink",
"state": {
+ "file": "📖chapters/🪦graveyard of ideas.md",
"collapseAll": false,
"extraContext": false,
"sortOrder": "alphabetical",
@@ -133,7 +124,9 @@
"type": "leaf",
"state": {
"type": "outline",
- "state": {}
+ "state": {
+ "file": "📖chapters/🪦graveyard of ideas.md"
+ }
}
},
{
@@ -163,26 +156,37 @@
"markdown-importer:Open format converter": false
}
},
- "active": "f35a250acbe1a2dc",
+ "active": "dad26e0868f82374",
"lastOpenFiles": [
+ "🖼️Media/effective-spead-options.png",
+ "📑notes/🍕Application study notes.md",
+ "📖chapters/🍕Application study.md",
+ "📥Inbox/@petersonEvaluationBiasesExecution2003.md",
+ "📥Inbox/@hagstromerBiasEffectiveBidask2021.md",
+ "📥Inbox/@Piwowar_2006.md",
+ "📥Inbox/@ellisAccuracyTradeClassification2000.md",
+ "📑notes/🍔Layer norm notes.md",
+ "📥Inbox/@leeMarketIntegrationPrice1993.md",
+ "📥Inbox/@petersenPostedEffectiveSpreads1994.md",
+ "📖chapters/🪦graveyard of ideas.md",
+ "📑notes/🔢Trade Initiator notes.md",
+ "📖chapters/🔢Trade initiator.md",
+ "📖chapters/🔢Quote rule.md",
+ "📑notes/🔢Depth rule notes.md",
+ "📥Inbox/@goettlerEquilibriumDynamicLimit2005.md",
+ "📥Inbox/@muravyevOptionsTradingCosts2020.md",
+ "📖chapters/🛌Token Embedding.md",
+ "🖼️Media/asymmetric-spread.png",
+ "🖼️Media/midpoint-spread.png",
+ "🖼️Media/eff-spread-finucane.png",
+ "📑notes/🛌 Token embeddings notes.md",
+ "📖chapters/💤Embeddings For Tabular Data.md",
"📖chapters/👨🍳Tain-Test-split.md",
"📑notes/👨🍳Train-Test-split notes.md",
"🖼️Media/viz-training-schemes.png",
"📖chapters/🚏Exploratory Data Analysis.md",
"📥Inbox/@lopezdepradoAdvancesFinancialMachine2018.md",
"📥Inbox/@nagelMachineLearningAsset2021.md",
- "📖chapters/🧭Kernel SHAP.md",
- "📥Inbox/@kaufmanLeakageDataMining2012.md",
- "📖chapters/🍪Selection Of Supervised Approaches.md",
- "📥Inbox/@raschkaModelEvaluationModel2020.md",
- "📖chapters/🧭Evaluation metric.md",
- "📖chapters/🪦graveyard of ideas.md",
- "📖chapters/🤖FTTransformer.md",
- "📑notes/🤖FTTransformer notes.md",
- "📖chapters/🤖TabTransformer.md",
- "📖chapters/🍔Layer Norm.md",
- "📑notes/🤖TabTransformer notes.md",
- "📑notes/🍔Layer norm notes.md",
- "📖chapters/🧭Attention Map.md"
+ "📖chapters/🧭Kernel SHAP.md"
]
}
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\221notes/\360\237\215\225Application study notes.md" "b/references/obsidian/\360\237\223\221notes/\360\237\215\225Application study notes.md"
new file mode 100644
index 00000000..9c91a780
--- /dev/null
+++ "b/references/obsidian/\360\237\223\221notes/\360\237\215\225Application study notes.md"
@@ -0,0 +1,74 @@
+**Intuitive Explanation from Hagströmer**
+Accurate liquidity measurement is important for liquidity timing and order routing. One of the most prevalent measures is the effective spread, defined as the percentage difference between the transaction price and the bid-ask spread midpoint. For example, if the quotes for a stock are $10.00 to sell and $10.01 to buy, the effective spread of a buy trade at $10.01 is half a cent (about 5 basis points).
+![[midpoint-spread.png]]
+
+To see the logic of the effective spread, consider the market maker who sells to the incoming trader. The market maker provides the service of immediate execution, and in return earns a premium relative to the fundamental value of the shares. The effective spread captures that premium, using the spread midpoint as a proxy for the fundamental value.
+
+**Problem of effective spreads and why it is a problem:**
+To see this, assume that the fundamental value of the stock in the example above is 10 dollars and 0.25 cents. The effective spread is then asymmetric; 0.5 cents for trades on the bid side and 1.5 cents (three times higher) on the ask side. If traders care about transaction costs, the relatively wide ask-side spread deters buyers, whereas the tight bid-side spread may attract sellers. There are then more traders submitting market orders at the bid side, and the true effective spread is, on average, smaller than the average midpoint effective spread (which is one cent).
+![[asymmetric-spread.png]]
+
+“The midpoint effective spread bias can be illustrated by a simple example. Consider a stock with a fundamental value of USD 25.0025 that has liquidity supplied at the nearest prices where trading is allowed, USD 25.00 and USD 25.01. The effective spread is then asymmetric. For trades executed at the bid price it is half a cent (2 × 0.25 cents), whereas for ask-side trades it is three times higher, 1.50 cents (2 × 0.75 cents). If investors factor in the cost asymmetry in their trading decisions, market orders are in this example more likely to arrive on the bid side than on the ask side. The effective spread is then, on average, smaller than the midpoint effective spread (which is one cent).” ([[@hagstromerBiasEffectiveBidask2021]], p. 315)
+
+- **Why it is important:**
+- capture true spread faced by traders.
+- - “We define the true transaction cost for a market order as the difference between the average transaction price and the true value of the asset.” (Goettler et al., 2005, p. 2172)
+- When trades are executed inside or outside the posted spread, the quoted spread does no longer represent the true spread for a trader.
+- “When trades are executed inside the posted bid-ask spread, the posted spread is no longer an accurate measure of transactions costs faced by investors.” (Petersen and Fialkowski, 1994, p. 209)
+- Petersen, M., and D. Fialkowski. "Posted versus Effective Spreads: Good Prices or Bad Quotes?" Journal of Financial Economics, 35 (1994), 269-292. [[@petersenPostedEffectiveSpreads1994]]
+- “The effective bid-ask spread is one of the most prevalent measures of market illiquidity, used in diverse applications ranging from the evaluation of market structure changes (e.g., Hendershott et al., 2011) and transaction cost measures (e.g., Hasbrouck, 2009), to asset pricing (e.g., Korajczyk and Sadka, 2008), corporate finance (e.g., Fang et al., 2009), and macroeconomics (e.g., Næs et al., 2011).” ([[@hagstromerBiasEffectiveBidask2021]], p. 314)
+- **quoted spread vs. effective spread:** “In the simple security markets described in standard financial models e.g., Roll (1984), trades take **place at the prices** posted by the specialist. Since market orders purchase stock at the ask and sell stock at the bid, these orders pay the spread between the bid and the ask. The spread arises due to the costs of and because liquidity providers face some traders who know more about the future value of the security than they do (Treynor (1971) and Glosten and Milgrom (1985)). In U.S. equity markets, however, trades inside the spread are a frequent occurrence.” Market orders may transact inside the spread if the specialist does not always display the best public limit orders. Market orders may also trade at prices better than the posted quotes when they are matched with other market orders. Once trades occur inside the posted spread, the posted spread overstates an investor’s expected trading costs. Since investors can expect to buy at prices lower than the ask and sell at prices higher than the bid, the effective spread is the relevant measure of trading costs.” (Petersen and Fialkowski, 1994, p. 210)
+
+- **Difference between quoted spread and effective spread:**
+Found at: https://berkeley-defi.github.io/assets/material/Bid-Ask-Spreads-Measuring-Trade-Execution-Costs-in-Financial-Markets-2010.pdf
+- The Quoted Spread The simplest measure of trading cost is the quoted spread (QS), which is defined as the difference between the bid and ask prices. The quoted spread measures the cost of completing a round trip (buy and sell), if trades are executed at the quoted prices. Execution costs for a single trade are often measured as half the spread, described on a percentage basis by equation (1): Quoted half-spread = QSit = 100 * (Askit – Bidit) / (2 times Mit) (1) where Ait and Bit are the posted ask price and bid price for security i at time t, respectively, and Mit, the quote midpoint or mean of Ait and Bit, is a proxy for the true underlying security value. The Effective Spread In many dealer markets, including those that trade fixed-income securities and foreign exchange, the quoted prices are simply a starting point for negotiations between customers and dealers, and transactions frequently occur at prices other than the quotes. Also, in some markets, including those relying on trading floors, there may be latent liquidity not reflected in the quotes. On the New York Stock Exchange (NYSE), for example, market orders may execute at prices within the quotes when the specialist (the NYSE’s designated dealer) or a floor broker elects to improve on the quote. Many electronic exchanges allow traders to hide some or all of the order size, implying that limit orders offering more attractive prices than the quotes may exist on the book.
+- Further, quoted prices pertain only to the quoted depth; large orders might exhaust the depth at the quote and “walk up the book”, executing against limit orders with less attractive prices and leading to a weighted-average trade price outside the quotes. When trades occur either within or outside the quotes, a better measure of trading costs is the percentage effective half spread, which is based on the actual trade price, and is computed on a percentage basis as described in equation (2): Effective half-spread = ESit = 100 * Dit * (Pit – Vit) / Vit (2) where Pit is the transaction price for security i at time t, Dit is an indicator variable that equals one for customer buy orders and negative one for customer sell orders, and Vit is an observable proxy for the true underlying value of security i at time t. The effective spread is based on the deviation between the execution price and the true underlying value of the security, and can be viewed as an estimate of the execution cost actually paid by the trader and the gross revenue earned by the liquidity provider.
+
+- **What it is:** “The effective spread is the difference between the transaction price and the “true value” ([[@goettlerEquilibriumDynamicLimit2005]]). As Bessembinder and Venkataraman (2010) explain, in empirical analyses the effective spread is computed using “an observable proxy for the true underlying value of (the) security.” Researchers often use the bid-ask midpoint as a proxy for the value and thus measure the effective half-spread as the difference between the transaction price and the bid-ask midpoint, adjusted for trade direction. If the timing of executions is uncorrelated with the errors in using the midpoint as a proxy, using it will result in unbiased estimates of average trading costs even though the errors resulting from the use of the proxy in place of the “true value” or fair value result in noisy estimates of the effective spread for individual trades. But if the timing of executions is correlated with the errors in using the midpoint as a proxy then the estimates of trading costs will be biased. This happens, for example, if executions are more likely to occur after some traders have entered aggressive limit orders, or if some traders are slow to cancel stale limit orders.” ([[@muravyevOptionsTradingCosts2020]], p. 4975)
+
+- **effective spread for options:** “Because executions at the ask (bid) price tend to occur when the estimates of options’ fair values are close to the ask (bid) price, the difference between the option value and the bid-ask midpoint is large conditional on a trade, and correlated with the trade direction. For example, an execution at the ask price might occur when the estimate of fair value is two cents below the ask price but the quote midpoint is five cents below the ask price. In this case the effective half-spread based on the fair value is only two cents but the conventional estimate based on the midpoint would be five cents.” ([[@muravyevOptionsTradingCosts2020]], p. 4975)
+- **Limitations:** “Lee (1993) and Blume and Goldstein (1992b) compare transaction prices to the midpoint of the quoted spread. This method accurately estimates the effective spread only when market orders are matched solely with limit orders (the specialist or a public limit order). If market orders are matched with each other, then this method will overestimate the spread actually paid. Some of the market orders will pay significantly less than the posted spread.” ([[@petersenPostedEffectiveSpreads1994]], p. 210)
+
+- **Why it is important:** “While this may seem a relatively modest improvement, a more compelling case for the proposed algorithm arises in specific applications. An important application of trade classification algorithms is the computation of effective bid-ask spreads, which measure the difference between actual trade prices and quotes (see Petersen and Fialkowski (1994)). When trades routinely transact between the quotes, this measure will give a smaller, and more accurate, estimate of transactions costs than do posted bid-ask spreads.” ([[@ellisAccuracyTradeClassification2000]], p. 540)
+
+- “Conceptually, the effective bid-ask spread measures the cost of immediate execution, defined as twice the difference between the transaction price and the fundamental value. Whereas transaction prices are widely disseminated in financial markets, the fundamental value is unobservable. Empirical implementations of the effective bidask spread instead rely on the average of the best bid and ask prices, known as the “midpoint,” as its benchmark (Blume and Goldstein, 1992 (unpublished?); [[@leeMarketIntegrationPrice1993]]). The use of the midpoint as a proxy for the fundamental value goes back to Demsetz (1968).” ([[@hagstromerBiasEffectiveBidask2021]], p. 314)
+
+- “One measure of this split is the transaction costs paid by market order submitters.” (Goettler et al., 2005, p. 2172)
+- “We define the true transaction cost for a market order as the difference between the average transaction price and the true value of the asset.” (Goettler et al., 2005, p. 2172)
+
+- “In the first set of tests, the liquidity premium for each trade is computed as the absolute difference between the actual trade price and the "midspread" (average of bid and ask prices) of the prevailing quote at the time of the trade. Assuming that, on average, the specialist's spread is set symmetrically around the equilibrium price, the liquidity premium provides an estimate of the effective "half-spread" for each trade."” ([[@leeMarketIntegrationPrice1993]]1018)
+
+- **Description in Savickas:** To assess the applicability of trade classification rules to estimating the effective bid-ask spread in the options market, we compute the dollar spread as follows: $S_i=2\left(P_{t, i}-P_{m, i}\right) I$, where $S_i$ is the effective spread implied by the price of transaction $i ; P_{m, i}$ is the bid-ask spread midpoint using quotes outstanding at the time of transaction $i ; P_{t, i}$ is the option price at transaction $i$; and $I=1$ if transaction $i$ is a buy and $I=-1$ if it is a sell. The percentage spread is computed as $P S_i=S_i / P_{m, i}$. Dollar and percentage spreads estimated by each of the classification techniques are averaged over the entire sample and compared to the actual average effective spreads. The results are in Table 5. All four methods perform poorly at estimating effective bid-ask spread for options. ([[@savickasInferringDirectionOption2003]], p. 7)
+
+- “The effective spread is calculated as Effective spread = 2 * I ([transaction price - midpoint price]), where I is an indicator variable that equals one for buy trades and negative one for sell trades, and the midpoint is the average of the bid and ask prices. Obviously, correct classification of buy and sell trades is crucial in calculations of the effective spread.14” ([[@ellisAccuracyTradeClassification2000]], p. 541)
+
+- **Motivation in Theissen:** “Usually, an estimate of the effective bid-ask spread is obtained by relating the transaction price to the midpoint of the bid and ask quote, i.e. $s_t^e = 200 \frac{p_t - m_t}{m_t}$ (1) where $p_t$ is the transaction price and $m_t$ is the midpoint of the bid and ask quote at the moment when the transaction occurs. We multiply the measure by 200 and divide it by the midquote to obtain an estimate of the percentage spread (rather than the half-spread).” (Theissen, 2000, p. 11)
+- “This measure implicitly assumes that transactions at prices above [below] the midpoint are buyer-initiated [seller-initiated]. Put differently, the assumption is made that the quote test classifies all transactions correctly. If this assumption is not true, some transactions at prices below the midpoint are buys and some transactions at prices above the midpoint are sells. This will result in a downward bias of the estimated effective spread irrespective of whether there is any systematic pattern in the classification accuracy. If the true trade classification is known and denoted $I^{true}$ (with $I^{true}$ equal to 1 [-1] for a buyer-initiated [seller-initiated] transaction), the effective spread can be estimated as $s_t^e = 200 \, I_t^{true} \frac{p_t - m_t}{m_t}$ (2). Lightfoot et al. (1999) compare this estimate to the traditional spread estimate and find that the bias of the traditional measure is economically significant, amounting to up to 30%.” (Theissen, 2000, p. 12)
+- **Statistical test:** “We repeated this analysis with our dataset from the Frankfurt Stock Exchange. The results are presented in columns 2 and 3 of Table 5. The bias is even more dramatic. The traditional spread estimate is, on average, about twice as large as the “true” spread.8 A Wilcoxon test rejects the null hypothesis of equal medians (p < 0.01). Despite the large differences, the correlation between the two spread estimates is very high (ρ= 0.96). The magnitude of the relative bias (i.e., the traditional spread estimate divided by the “true” spread) is strongly negatively related to the classification accuracy. The correlation is –0.84.” ([[@theissenTestAccuracyLee2000]] p. 12)
+- “**effective spread and relative effective spread** The trading cost measures considered here are the effective spread and the relative effective spread. The effective spread, which represents the round trip execution costs, less commissions, is calculated as: Effective spread = 2 × D × (Price − midpoint) where D is the trade direction, +1 for a buy, and −1 for a sell. Using only TAQ data, one must infer D. The midpoint must also be estimated because the TAQ data report the trade time, not the order submission time. As noted in Bacidore et al. (1999), execution quality is most appropriately measured setting the benchmark quote to that prevailing at order submission time. The relative effective spread is calculated as Relative effective spread = Effective spread / price. Next, we demonstrate the possible limitations of trade and quote data in estimating trading costs by providing two specific examples—price improvement in minimum variation markets and quote changes prior to trade execution.” ([[@petersonEvaluationBiasesExecution2003]] p. 261)
+
+- “Although it is clear that the classification errors associated with LR's algorithm and the tick test are significant, it is important to consider whether empirical research employing these methods might be biased by the errors. Since these methods are most commonly applied in estimating effective bid-ask spreads and signed volume, the possibility that imperfectly inferring trade direction might bias empirical inferences is tested by comparing true effective spreads to effective spreads estimated using LR's method and the tick test, and by comparing true signed volume to signed volume estimated using the two methods of inferring trade direction. The sample of 144 firms is first divided into dollar volume deciles, and mean effective percentage bid-ask spreads are estimated for each decile. Effective spreads are measured as twice the percentage half-spread for each observation, (3) $\mathrm{ES}_{it} = 2(100) D_{it}\left(S_{it}-M_{it}\right)/M_{it}$, where $D_{it} = 1$ for purchases and $-1$ for sales, $S_{it}$ is the trade price for stock i at time t, and $M_{it}$ is the midpoint of the quoted spread. $D_{it}$ is measured in three ways: using the true order direction from the TORQ order file, using LR's algorithm, and using the tick test.13 The true effective spreads, estimated effective spreads, and quoted spreads are presented in (see screenshot below).” ([[@finucaneDirectTestMethods2000]], p. 569)
+- ${ }^{13}$ Since LR's algorithm always assigns a direction of one to trades above the midpoint and $-1$ to trades below the midpoint, and since the effective spread for trades at the midpoint is equal to zero, an equivalent expression for the effective spread under LR's method is: $\mathrm{ES}_{i t}=2(100)\left|S_{i t}-M_{i t}\right| / M_{i t}$. ([[@finucaneDirectTestMethods2000]]569)
+
+![[eff-spread-finucane.png]]
+
+![[effective-spead-options.png]]
+(copied from [[@savickasInferringDirectionOption2003]]897)
+
+**Calculate differences:** Panel A indicates improvements can be made when using different quote lags and/ or trade direction algorithms. Staying with the $1 / 8$ ths pricing regime, we see the EMO algorithm with no lag for benchmark quote assignment reduces biases to $6.6 \%$ $(=12.78 / 11.99-1)$. For most categories, the EMO algorithm outperforms the LR algorithm and the Lee/Ready estimator. (For each row in the table we have bolded the cell with the least bias.) (found in [[@petersonEvaluationBiasesExecution2003]] 278)
+
+**Why we use the midpoint:** “In the presence of trading frictions, the transaction price P typically differs from the fundamental value X. The effective spread quantifies the difference, and may be viewed as a premium paid for the service of immediacy in securities trading. The nominal effective spread is defined as S = 2D(P − X ), (1) where D is a direction of trade indicator taking the value +1 for buyer-initiated trades, and -1 for seller-initiated trades. The multiplication by two is for consistency with the quoted bid-ask spread (defined below for a hypothetical roundtrip trade). For ease of exposition, I suppress stock and time subscripts for all variables in this section. Because the fundamental value at the time of transaction is unobservable, the effective spread is estimated relative to a proxy, which I denote ̃ X. An effective spread estimator can then be defined as ̃ S = 2D(P − ̃ X). (2) Various fundamental value proxies are distinguished with the superscript v, ̃ Xv. For example, I denote the midpoint ̃ Xmid. Similarly, an effective spread estimator utilizing the fundamental value estimator v is denoted ̃ Sv. The midpoint effective spread, as defined by Blume and Goldstein (1992) and Lee (1993) as well as in the RegNMS Rule 605, is thus denoted ̃ Smid . An effective spread estimator is unbiased if the expected difference between the expressions in (1) and (2) is zero. The expected difference is E( ̃ S − S) = 2E(D(X − ̃ X)), (3) implying that the effective spread estimator is unbiased if and only if D and (X − ̃ X ) are uncorrelated. This can be expected either if investors are unable to assess the sign of (X − ̃ X ), or if the liquidity demand elasticity is zero. Consider again the example in the introduction. Whe” ([[@hagstromerBiasEffectiveBidask2021]], p. 317)
+
+**reasons why we use the midpoint:** “The appeal of the midpoint is arguably data availability and simplicity. Data on the best bid and ask prices are publicly available for many asset classes and market types (both auction and dealer markets) and in long time series. In markets where the quotes are valid until canceled, such as limit order book markets, midpoint observations are available continuously during trading hours. Furthermore, the midpoint is straightforward to compute in real time, and is easy to understand for all market participants.” ([[@hagstromerBiasEffectiveBidask2021]], p. 317)
+
+**Definition:** "We also define the effective spread, a commonly used proxy for transaction costs. Recall that $m_t$ is the midpoint of the bid and ask quotes.
+
+Definition 2: The true transaction cost faced by a market order of size $\bar{x}$ at time $t$ is
+$$
+C_t(\bar{x})=\left(P_t(\bar{x})-v_t\right) \operatorname{sign}(\bar{x}) .
+$$
+The effective spread, $S_t(\bar{x})$, faced by a market order of size $\bar{x}$ at time $t$ is
+$$
+S_t(\bar{x})=\left(P_t(\bar{x})-m_t\right) \operatorname{sign}(\bar{x}) .
+$$
+The true transaction cost is necessarily unobservable in real data. In many econometric specifications (see Hasbrouck (2002) for a summary), the execution price is decomposed into the sum of the "efficient price" and microstructure effects. In our model, the efficient price is just the consensus value, $v_t$. Thus, our transaction cost $C_t$ is simply the microstructure effect times the signed order flow. Our sellers pay a cost if the transaction price is greater than the consensus value and receive a discount if it is below. Consider a trade that occurs at time $t$. The consumer surplus that accrues to the market order and limit order submitters is a measure of the net change in their welfare. ${ }^{25}$ Recall that $\bar{x}>0$ indicates a market buy order, and $\bar{x}<0$ a market sell order." (from [[@goettlerEquilibriumDynamicLimit2005]] 2173)
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\221notes/\360\237\224\242Depth rule notes.md" "b/references/obsidian/\360\237\223\221notes/\360\237\224\242Depth rule notes.md"
index a395ae4e..e1b6c078 100644
--- "a/references/obsidian/\360\237\223\221notes/\360\237\224\242Depth rule notes.md"
+++ "b/references/obsidian/\360\237\223\221notes/\360\237\224\242Depth rule notes.md"
@@ -7,6 +7,8 @@ Tags: #trade-classification
**motivation:**
- “We hypothesize that a larger bid or ask quoted size, i.e., a higher depth at the best bid or ask, indicates a higher liquidity similar to a tighter bid or ask quote” ([[@grauerOptionTradeClassification2022]]), p. 14)
- “As a consequence, we classify midspread trades as buyer-initiated, if the ask size exceeds the bid size, and as seller-initiated, if the bid size is higher than the ask size. If the ask size matches the bid size, midspread trades still cannot be classified by this approach” ([[@grauerOptionTradeClassification2022]]), p. 14)
+- Seems like there is a broader concept. Found in Hagströmer: “I propose two alternative effective spread estimators that overcome the problem of discrete prices. For these, I rely on continuous fundamental value proxies that factor in the relative quantities posted at the best bid and ask prices, which I refer to as the “order book imbalance. The motivation is that the depth at a given price depends on a tradeoff between the liquidity suppliers’ revenue at execution (the expected effective spread) and the costs of trading with informed traders (as in Glosten, 1994). If the bidside spread is tight, market makers infer that the potential revenue is low and then quote relatively low bid-price quantities. Empirical evidence of such quoting schemes are provided by Kavajecz (1999) and Sandås (2001).” ([[@hagstromerBiasEffectiveBidask2021]], p. 315)
+- “Why the order book imbalance is potentially useful to track the fundamental value is best understood from a liquidity-supplier perspective. Although the bias considered here is driven in large part by the price sensitivity of liquidity demanders, the depths at the best quotes are determined by the distance to the fundamental value. In the model by Glosten (1994), the optimal depth in the limit order book is based on a tradeoff between the revenues expected from earning the effective spread, and the costs of trading with informed market orders. In a setting where liquidity demanders are potentially informed, and where the market order size depends on their marginal valuation and the terms of trade offered in the limit order book, Glosten (1994, Proposition 2) shows that the depth posted at a given price level, in equilibrium, is increasing in the distance to the fundamental value.4 That is, if the bid depth is lower than the ask depth, it indicates that the fundamental value is closer to the bid than to the ask price.” (Hagströmer, 2021, p. 318)
**limitation:** 👩🚒 only a proxy for tick rule. Must be combined with other rules.
diff --git "a/references/obsidian/\360\237\223\221notes/\360\237\224\242Trade Initiator notes.md" "b/references/obsidian/\360\237\223\221notes/\360\237\224\242Trade Initiator notes.md"
index 91ebd134..5d5519aa 100644
--- "a/references/obsidian/\360\237\223\221notes/\360\237\224\242Trade Initiator notes.md"
+++ "b/references/obsidian/\360\237\223\221notes/\360\237\224\242Trade Initiator notes.md"
@@ -8,10 +8,15 @@ relates to [[🌏Ise dataset]]
- “We take advantage of the fact that if there were only customer buy (sell) orders on a specific day for a given option series at one particular exchange, Open/Close data allows to classify all transactions in the LiveVol dataset on that day at the respective exchange as buy (sell) orders.” ([[@grauerOptionTradeClassification2022]], p. 8)
- https://www.cboe.com/us/equities/trading/offerings/order_types_and_routing/
- “We take advantage of the fact that if there were only customer buy (sell) orders on a specific day for a given option series at one particular exchange, Open/Close data allows to classify all transactions in the LiveVol dataset on that day at the respective exchange as buy (sell) orders.” (Grauer et al., 2022, p. 8) “Consistent with previous literature, we assume that the customer is the party with a demand for options.” (Grauer et al., 2022, p. 8) “Therefore, we use the customer buy/sell indicator obtained from Open/Close data as the benchmark to empirically validate the accuracy of trade classification methods applied to intraday option transactions from LiveVol.” (Grauer et al., 2022, p. 8)
+- “For each record in the matched trade sample, one of five trader types is available: M, C, B, F, and N. The letter represents the account to which a trade is assigned. CBOE market makers are indicated by M, and public customer accounts are denoted by C. Broker-dealer accounts (as distinct from market maker accounts) are represented by B and are defined according to the SEC’s definition of a broker-dealer. These trades are done on their own behalf, not as agents (which would be listed as C trades). F and N indicate firm and non-member accounts. It is worth noting that CBOE members representing large financial conglomerates, such as Merrill Lynch or Fidelity, might trade as M, B, or F, depending on which account is to be assigned to the trade. The data do not provide information on the identity of each trader, only the type.” ([[@savickasInferringDirectionOption2003]], 2003, p. 884)
+
Following a common track in literature, we
## Views
+- “In the literature, researchers use different definitions of trade initiators based presumably on data availability. Odders-White (2000) considers the last arriving order to be the trade initiator. She can make this determination because the TORQ database includes the NYSE audit file, which contains order-entry time for both sides of the trade. Papers such as Lee (1992) and Petersen and Fialkowski (1994) consider the active side to be market orders. Kraus and Stoll (1972) consider the active side to be the side with fewer parties. Finucane (2000) and Lee and Radhakrishna (2000) note many orders cannot be unambiguously defined as buyeror seller-initiated. Finucane (2000) finds that nearly one-fourth of all trades do not occur as the result of the arrival of a market order. In his final analysis, Finucane (2000) examines trades with at least one standard non-tick sensitive buy or sell market order in the trade. Ellis et al. (2000) and Theissen (2000) take the approach of inferring trade direction fromthe side contra to the dealer.” ([[@petersonEvaluationBiasesExecution2003]], p. 263)
+
+“Because we do not have access to the NYSE audit file, we cannot define a trade initiator in the same way as those who have used TORQ data. Therefore, our approach will be to begin with all regular-way orders and exclude orders that are most likely not initiators. The following orders are excluded: (a.) limit orders that are not ‘marketable’, that is buy orders with limit price less than the ask or sell orders with limit price greater than the bid, (b.) tick sensitive orders because they usually do not initiate trades, (c.) stopped,3 or guaranteed orders, because these orders tend to be more like limit orders, and (d.) partial executions of marketable limit orders for more shares than are at the best quote and execute in multiple parts.” ([[@petersonEvaluationBiasesExecution2003]], p. 264)
- There are different views of what is considered as buyer / seller initiated i. e. [[@odders-whiteOccurrenceConsequencesInaccurate2000]] vs. [[@ellisAccuracyTradeClassification2000]]
(see [[@theissenTestAccuracyLee2000]] for more details)
- “The goal of the trade side classification is to determine the initiator of the transaction and to classify trades as being either buyer or seller motivated. However, a formal definition of a trade initiator is rarely stated in the literature. For example, the so-called “immediacy” definition describes initiators as traders who demand immediate execution (e.g. Lee and Radhakrishna, 2000). According to Odders-White (2000), the initiator of a transaction is the investor (buyer or seller) who placed his/her order last, chronologically (the so called “chronological” definition). These two definitions are equivalent in many cases. In both definitions, the initiator is the person who caused the transaction to occur.” ([[@olbrysEvaluatingTradeSide2018]] p. 4)
diff --git "a/references/obsidian/\360\237\223\221notes/\360\237\233\214 Token embeddings notes.md" "b/references/obsidian/\360\237\223\221notes/\360\237\233\214 Token embeddings notes.md"
index ffc4b35a..376ec760 100644
--- "a/references/obsidian/\360\237\223\221notes/\360\237\233\214 Token embeddings notes.md"
+++ "b/references/obsidian/\360\237\223\221notes/\360\237\233\214 Token embeddings notes.md"
@@ -21,6 +21,25 @@ print(cosine_sim)
```
+## Numerical embeddings: Why and how?
+See: https://blog.ayoungprogrammer.com/2018/01/deep-recurrent-neural-networks-for-mathematical-sequence-prediction.html
+Encoding numerical inputs for neural networks is difficult because the representation space is very large and there is no easy way to embed numbers into a smaller space without losing information. Some of the ways this is currently handled are:
+
+- Scale inputs from minimum and maximum values to [-1, 1]
+- One hot for each number
+- One hot for different bins (e.g. [0-0], [1-2], [3-7], [8 – 19], [20, infty])
+
+In small integer number ranges, these methods can work well, but they don’t scale well for wider ranges. In the input scaling approach, precision is lost making it difficult to distinguish between two numbers close in value. For the binning methods, information about the mathematical properties of the numbers such as adjacency and scaling is lost.
+
+The desiderata for our embeddings of numbers as vectors are as follows:
+
+- able to handle numbers of arbitrary length
+- captures mathematical relationships between numbers (addition, multiplication, etc.)
+- able to model sequences of numbers
+
+In this blog post, we will explore a novel approach for embedding numbers as vectors that satisfies these desiderata.
+
+
## Notes from Phuong and Hutter
(see [[@phuongFormalAlgorithmsTransformers2022]])
![[token-embedding.png]]
diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md"
index e85dfaba..3a23fdd1 100644
--- "a/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md"
+++ "b/references/obsidian/\360\237\223\226chapters/\360\237\215\225Application study.md"
@@ -1,4 +1,24 @@
-See [[@jurkatisInferringTradeDirections2022]].
-Effective spread calculation. See e. g., [[@ellisAccuracyTradeClassification2000]].
+## Setup
+Although the classification accuracy is a reasonable measure for comparing classifiers, one cannot immediately infer how changes in accuracy, e. g., an improvement by $1\%$, affect the application domains. In an attempt to make our results tangible, we apply all algorithms to estimate trading cost, a problem we previously identified to be reliant on correct trade classification (cp. [[👶Introduction]]) and a common testing ground for trade classification rules (cp. [[@ellisAccuracyTradeClassification2000]]541) and ([[@finucaneDirectTestMethods2000]]569) and ([[@petersonEvaluationBiasesExecution2003]]271--278) and ([[@savickasInferringDirectionOption2003]]896--897).
-“To give the improvement in classification accuracy more economic meaning, I apply the trade classification methods to the estimation of transaction costs. The transaction costs in turn are used in a portfolio optimization exercise. The results show that an investor with a mean-variance utility function would be willing to forgo up to 33 bps on yearly returns to use the proposed algorithm to estimate transaction costs instead of the LR algorithm.” ([[@jurkatisInferringTradeDirections2022]], 2022, p. 7)
\ No newline at end of file
+One of the most widely adopted measures for trading costs is the effective spread ([[@Piwowar_2006]]112). It is defined as the difference between the trade price and the fundamental value of the asset ([[@bessembinderIssuesAssessingTrade2003]]238--239). Following ([[@bessembinderIssuesAssessingTrade2003]]238--239), we define the *nominal, effective spread* as
+$$
+S_{i,t} = 2 (P_{i,t} - V_{i,t}) D_{i,t}.
+$$
+Like before, $i$ indexes the security and $t$ denotes the trade. Here, $D_{i,t}$ is the trade direction, which is either $1$ for customer buy orders and $-1$ for customer sell orders. If the trade initiator is known, we set $D_{i,t} = y_{i,t}$ and $D_{i,t}=\hat{y}_{it}$, if inferred from a rule or classifier. As the fundamental value $V_{i,t}$ is unobserved at the time of the trade, we follow a common track in research and use the midpoint of the prevailing quotes as an observable proxy. footnote-(For an alternative treatment for options (cp.[[@muravyevOptionsTradingCosts2020]]4975--4976). Our focus is on the midspread, as it is the most common proxy for the value.) This is also a natural choice, assuming that, on average, the spread is symmetrical and centred around the true fundamental value ([[@leeMarketIntegrationPrice1993]]1018). ~~~([[@hagstromerBiasEffectiveBidask2021]]317) reasons that the appeal of using the midpoint lies in the high data availability, simplicity, and applicability in an online setting.~~ We multiply the so-obtained half-spread by $2$ to obtain the effective spread, which represents the cost for a round trip trade involving a buy and sell ex commissions.
+
+Readily apparent from (cref-eq), poor estimates of the predicted trade direction lead to an under- or over-estimated effective spread, and hence to a skewed trade cost estimate. By comparing the true effective spread with the estimated one, we can derive the economic significance. For convenience, we also calculate the *relative effective spread* as
+$$
+{PS}_{i,t} = S_{i,t} / V_{i,t}.
+$$
+The subsequent section estimates both the nominal and relative effective spread for our test sets.
+
+## Results
+The actual and the estimated effective spreads, as well as the quoted spread, are shown in the (cref tab) aggregated by mean. ([[@savickasInferringDirectionOption2003]] 896--897) previously estimated the effective spreads on a subset of rules for option trades at the gls-CBOE, which can be compared against.
+
+A $t$-test is used to test if the estimated effective spread is significantly different from the mean true effective spread / significantly greater than zero at $p=0.01$ (cp.[[@finucaneDirectTestMethods2000]]570). Alternatively, compare correlations $\rho$ and medians using the Wilcoxon test with the null hypothesis of equal medians at $p=0.01$ (cp.[[@theissenTestAccuracyLee2000]]12).
+
+(🔥What can we see? How do the results compare?)
+
+**Notes:**
+[[🍕Application study notes]]
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\224\242Quote rule.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\224\242Quote rule.md"
index eb5a432f..d22c144e 100644
--- "a/references/obsidian/\360\237\223\226chapters/\360\237\224\242Quote rule.md"
+++ "b/references/obsidian/\360\237\223\226chapters/\360\237\224\242Quote rule.md"
@@ -1,6 +1,8 @@
“Methods of inferring trade direction can be classified as: tick tests, which use changes in trade prices; the quote method, which compares trade prices to quotes;” (Finucane, 2000, p. 557)
+**In a simple model market orders are executed at the quotes:** “In the simple security markets described in standard financial models [e.g., Roll (1984)], trades take place at the prices posted by the specialist. Since market orders purchase stock at the ask and sell stock at the bid, these orders pay the spread between the bid and the ask. The spread arises due to the costs of and because liquidity providers face some traders who know more about the future value of the security than they [Treynor (1971) and Glosten and Milgrom (1985)]. In U.S. equity markets, however, trades inside the spread are a frequent occurrence.Market orders may transact inside the spread if the specialist does not always display the best public limit orders. Market orders may also trade at prices better than the posted quotes when they are matched with other market orders. Once trades occur inside the posted spread, the posted spread overstates an investor’s expected trading costs. Since investors can expect to buy at prices lower than the ask and sell at prices higher than the bid, the effective spread is the relevant measure of trading costs.” (Petersen and Fialkowski, 1994, p. 210)
+
The quote rule compares the trade price against the corresponding quotes at the time of the trade. (Intuition?) If the trade price $p_{i,t}$ is above the midpoint of the bid-ask spread, denoted by $m_{i,t}$, the trade is classified as a buy and if it is below the midpoint, as a sell ([[@harrisDayEndTransactionPrice1989]] p.41). Thus, the classification rule, is formally given by:
$$
%\tag{10}
diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md"
index 14d11303..518f743d 100644
--- "a/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md"
+++ "b/references/obsidian/\360\237\223\226chapters/\360\237\233\214Token Embedding.md"
@@ -22,7 +22,9 @@ e_{\text{king}}&=W_e[:,2] = [0.01, 0.20, 0.134]^T\\
e_{\text{queen}}&=W_e[:,1] = [0.07, 0.157, 0.139]^T\\
\end{aligned}
$$
-are likely to be close in space with cosine-similarity of $\approx 1$ due to their high semantic similarity. Embeddings can only encode the semantic relationship of tokens, but they do not provide a clue to the model about the relative and absolute ordering of tokens in which they appear in the sequence, since all stages of the encoder and decoder are invariant to the token's position (see [[@tunstallNaturalLanguageProcessing2022]] (p. 72) or [[@phuongFormalAlgorithmsTransformers2022]]). To preserve the ordering, positional information must be induced to the model using a [[🧵Positional Embedding]]. Another limitation of embeddings is, that identical tokens share their embedding, even if they are ambiguous and their meaning is different from the context in which they appear. To resolve this issue, embeddings get contextualized in the self-attention mechanism (see chapter [[🅰️Attention]]).
+are likely to be close in space with cosine-similarity of $\approx 1$ due to their high semantic similarity.
+
+Embeddings can only encode the semantic relationship of tokens, but they do not provide a clue to the model about the relative and absolute ordering of tokens in which they appear in the sequence, since all stages of the encoder and decoder are invariant to the token's position (see [[@tunstallNaturalLanguageProcessing2022]] (p. 72) or [[@phuongFormalAlgorithmsTransformers2022]]). To preserve the ordering, positional information must be induced to the model using a [[🧵Positional Embedding]]. Another limitation of embeddings is, that identical tokens share their embedding, even if they are ambiguous and their meaning is different from the context in which they appear. To resolve this issue, embeddings get contextualized in the self-attention mechanism (see chapter [[🅰️Attention]]).
Our running example uses word embeddings, motivated by the domain in which transformers were proposed. However, the novel idea of capturing semantics as embedding vectors extends to other discrete entities, as we explore in chapter [[💤Embeddings For Tabular Data]].
@@ -31,6 +33,27 @@ Our running example uses word embeddings, motivated by the domain in which trans
[^1:]There is a subtle difference between tokens and words. A token can be words including punctuation marks. But words can also be split into multiple tokens, which are known as sub-words. To decrease the size of the vocabulary, words may be reduced to their stems, lower-cased, and stop words be removed. See (...) for in-depth coverage of pre-processing techniques.
[^2:] Throughout this work, we adhere to a notation suggested in [[@phuongFormalAlgorithmsTransformers2022]] (p. 1 f) to maintain consistency.
+------
+In the chapter [[🤖Transformer]] we have shown that processing token embeddings and contextualizing them, is the core idea behind Transformers. Yet, [[🛌Token Embedding]]s are tailored towards textual data. With all tokens coming from the same vocabulary, a homogeneous embedding procedure suffices. As this work is concerned with trade classification on tabular datasets containing both numerical and categorical features, the aforementioned concept is not directly applicable and must be evolved to a generic feature embedding. We do this separately for categorical and numerical features.
+
+Tabular data is flexible with regard to the columns, their data type, and their semantics. While features maintain a shared meaning across rows or samples, no universal semantics can be assumed across columns. For instance, every sample in a trade data set may contain the previous trade price, yet the meaning of the trade price is different from other columns, urging the need for heterogeneous embeddings.
+
+**Numerical embedding** 🔢
+Columns may be categorical or numerical. Transformer-like architectures handle numerical features by mapping the scalar value to a high-dimensional embedding vector and processing sequences thereof [[@gorishniyEmbeddingsNumericalFeatures2022]]. In the simplest case, a learned linear projection is utilized to obtain the embedding. Linear embeddings of numerical features were previously explored in [[@kossenSelfAttentionDatapointsGoing2021]], [[@somepalliSAINTImprovedNeural2021]], [[@chengWideDeepLearning2016]], or [[@gorishniyRevisitingDeepLearning2021]]. More sophisticated approaches rely on parametric embeddings, like the *piece-wise linear encoding* or the *periodic encoding* of [[@gorishniyEmbeddingsNumericalFeatures2022]]. Both enforce a non-linear mapping. [[@gorishniyEmbeddingsNumericalFeatures2022]] show that these can improve the model's performance, but at an additional computational cost. Alternatively, numerical features can be processed as a scalar in non-transformer-based networks and therefore independently from other features. We explore this idea as part of our discussion on [[🤖TabTransformer]].
+
+Despite this simpler alternative, numerical embeddings are desirable, as a recent line of research, e. g., [[@gorishniyEmbeddingsNumericalFeatures2022]] and [[@somepalliSAINTImprovedNeural2021]], suggests that numerical embeddings can significantly improve the performance of Transformers and their robustness to missing values or noise. For example, [[@somepalliSAINTImprovedNeural2021]] report an increase in *AUC* (ROC) from 89.38 % to 91.72 % merely through embedding numerical features. Their work, however, offers no theoretical explanation. [[@grinsztajnWhyTreebasedModels2022]] (p. 8f.) fill this void. The authors find that the mere use of embeddings breaks rotation invariance. *Rotational invariance* in the spirit of [[@ngFeatureSelectionVs2004]] refers to the model's dependency, (...).
+
+**Categorical embeddings** 🗃️
+Recall from the chapter [[🍪Selection Of Supervised Approaches]] that categorical data is data, that is divided into groups. In the context of trade classification, the option type is categorical and takes values $\{\text{'C'},\text{'P'}\}$ for calls and puts. Similar to a token, a category, e. g., $\text{'P'}$ in the previous example, must be represented as a multi-dimensional vector to be handled by the Transformer. Even when processed in other types of neural networks, categories need to be converted to real-valued inputs first, in order to optimize parameters with gradient descent.
+
+A classical strategy is to apply one-hot-encoding to categorical features, whereby each category is mapped to a sparse vector, which can then be processed by a neural network. While this approach is conceptually simple and frequently employed in neural network architectures, it has several drawbacks, such as producing sparse vectors whose length grows directly with the cardinality of the feature. For instance, applying one-hot-encoding to the categorical underlyings $\texttt{GOOGL}$ (Alphabet Inc.), $\texttt{MSFT}$ (Microsoft Inc.), and $\texttt{K}$ (Kellogg Company) would result in sparse vectors equidistant in terms of cosine distance. Naturally, one would expect a greater similarity between the first two underlyings due to the overlapping field of operations.
+
+For Transformers learned, categorical embeddings are common, which are a direct adaption of the token embeddings ([[@wangTransTabLearningTransferable]], [[@gorishniyRevisitingDeepLearning2021]], [[@huangTabTransformerTabularData2020]], [[@somepalliSAINTImprovedNeural2021]]). A category is mapped to an embedding vector using a learned, embedding matrix, as in Equation [[🛌Token Embedding#^4bee48]]. These embeddings can potentially capture intrinsic properties of categorical variables by arranging similar items closer in the embedding space. For high cardinal variables, learned embeddings also have the advantage of being memory efficient, as the length of the embedding vector is untied from the cardinality of the variable [[@guoEntityEmbeddingsCategorical2016]] (p. 1). Despite these advantages, learned, categorical embeddings still lack a sound theoretical foundation and remain an open research problem [[@hancockSurveyCategoricalData2020]] (p. 28). In a similar vein, [[@borisovDeepNeuralNetworks2022]] (p. 2) note, that handling high-dimensional categoricals has not been resolved by existing approaches. Being dependent on a few samples, high cardinality is equally problematic for learned embeddings. We come back to this issue in later chapters.
+
+Like in chapter [[🛌Token Embedding]] the dimension of the embedding $e_{d}$ affects the expressiveness of the network and is a tunable hyperparameter. One major drawback of learned embeddings is that they contribute to the parameter count of the model through the embedding matrix or increased layer capacity of subsequent layers.
+
+To this end, embeddings are non-exclusive to Transformer-based architectures, and can be used in other deep learning-based approaches, and even classical machine learning models, like [[🐈Gradient Boosting]]. Covering these combinations is outside the scope of this work. We refer the reader to [[@gorishniyEmbeddingsNumericalFeatures2022]] for an in-depth comparison. Next, our focus is on two concrete examples of Transformers for tabular data.
+
**Notes:**
[[🛌 Token embeddings notes]]
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\226chapters/\360\237\252\246graveyard of ideas.md" "b/references/obsidian/\360\237\223\226chapters/\360\237\252\246graveyard of ideas.md"
index d98ea07d..8682504d 100644
--- "a/references/obsidian/\360\237\223\226chapters/\360\237\252\246graveyard of ideas.md"
+++ "b/references/obsidian/\360\237\223\226chapters/\360\237\252\246graveyard of ideas.md"
@@ -1,3 +1,13 @@
+## Embeddings
+
+
+
+## Applicaton study
+See [[@jurkatisInferringTradeDirections2022]].
+Effective spread calculation. See e. g., [[@ellisAccuracyTradeClassification2000]].
+
+“To give the improvement in classification accuracy more economic meaning, I apply the trade classification methods to the estimation of transaction costs. The transaction costs in turn are used in a portfolio optimization exercise. The results show that an investor with a mean-variance utility function would be willing to forgo up to 33 bps on yearly returns to use the proposed algorithm to estimate transaction costs instead of the LR algorithm.” ([[@jurkatisInferringTradeDirections2022]], 2022, p. 7)
+
## Transformer
At times we fall back to the Transformer for machine translations, to develop a deeper understanding of the architecture and its components.
diff --git "a/references/obsidian/\360\237\223\245Inbox/@Piwowar_2006.md" "b/references/obsidian/\360\237\223\245Inbox/@Piwowar_2006.md"
new file mode 100644
index 00000000..c02b1f79
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@Piwowar_2006.md"
@@ -0,0 +1,13 @@
+*title:* The sensitivity of effective spread estimates to Trade–Quote matching algorithms
+*authors:* Michael S. Piwowar, Li Wei
+*year:* 2006
+*tags:*
+*status:* #📥
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note:
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@goettlerEquilibriumDynamicLimit2005.md" "b/references/obsidian/\360\237\223\245Inbox/@goettlerEquilibriumDynamicLimit2005.md"
new file mode 100644
index 00000000..21d75e13
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@goettlerEquilibriumDynamicLimit2005.md"
@@ -0,0 +1,13 @@
+*title:* Equilibrium in a Dynamic Limit Order Market
+*authors:* Ronald L. Goettler, Christine A. Parlour, Uday Rajan
+*year:* 2005
+*tags:*
+*status:* #📥
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note:
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@hagstromerBiasEffectiveBidask2021.md" "b/references/obsidian/\360\237\223\245Inbox/@hagstromerBiasEffectiveBidask2021.md"
new file mode 100644
index 00000000..723cc638
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@hagstromerBiasEffectiveBidask2021.md"
@@ -0,0 +1,13 @@
+*title:* Bias in the effective bid-ask spread
+*authors:* Björn Hagströmer
+*year:* 2021
+*tags:*
+*status:* #📥
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note:
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@leeMarketIntegrationPrice1993.md" "b/references/obsidian/\360\237\223\245Inbox/@leeMarketIntegrationPrice1993.md"
new file mode 100644
index 00000000..6b22e4e5
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@leeMarketIntegrationPrice1993.md"
@@ -0,0 +1,18 @@
+*title:* Market Integration and Price Execution for NYSE-Listed Securities
+*authors:* Charles M. C. Lee
+*year:* 1993
+*tags:* #liquidity #effective-spread
+*status:* #📦
+*related:*
+- [[@hagstromerBiasEffectiveBidask2021]] (made me aware of this paper)
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note:
+
+“They also compute a liquidity premium per trade based on the absolute difference between the trade price and the quote "midspread."” ([Lee, 1993, p. 1016](zotero://select/library/items/QMJI4B6K)) ([pdf](zotero://open-pdf/library/items/YFZIMMTT?page=8&annotation=DT3XB943))
+
+“In the first set of tests, the liquidity premium for each trade is computed as the absolute difference between the actual trade price and the "midspread" (average of bid and ask prices) of the prevailing quote at the time of the trade. Assuming that, on average, the specialist's spread is set symmetrically around the equilibrium price, the liquidity premium provides an estimate of the effective "half-spread" for each trade."” ([Lee, 1993, p. 1018](zotero://select/library/items/QMJI4B6K)) ([pdf](zotero://open-pdf/library/items/YFZIMMTT?page=10&annotation=92M79LI2))
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@muravyevOptionsTradingCosts2020.md" "b/references/obsidian/\360\237\223\245Inbox/@muravyevOptionsTradingCosts2020.md"
new file mode 100644
index 00000000..5105d470
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@muravyevOptionsTradingCosts2020.md"
@@ -0,0 +1,13 @@
+*title:* Options Trading Costs Are Lower than You Think
+*authors:* Dmitriy Muravyev, Neil D Pearson
+*year:* 2020
+*tags:*
+*status:* #📥
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note:
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@petersenPostedEffectiveSpreads1994.md" "b/references/obsidian/\360\237\223\245Inbox/@petersenPostedEffectiveSpreads1994.md"
new file mode 100644
index 00000000..99bb5b3b
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@petersenPostedEffectiveSpreads1994.md"
@@ -0,0 +1,31 @@
+*title:* Posted versus effective spreads
+*authors:* Mitchell A. Petersen, David Fialkowski
+*year:* 1994
+*tags:* #effective-spread #trade-classification
+*status:* #📦
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note:
+
+“When trades are executed inside the posted bid-ask spread, the posted spread is no longer an accurate measure of transactions costs faced by investors.” ([Petersen and Fialkowski, 1994, p. 209](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=1&annotation=J2HT2P2F))
+
+“In the simple security markets described in standard financial models [e.g., Roll (1984)], trades take place at the prices posted by the specialist. Since market orders purchase stock at the ask and sell stock at the bid, these orders pay the spread between the bid and the ask. The spread arises due to the costs of Correspondence fo: Mitchell Petersen, GSB, University of Chicago, 1101 E. 58th Street, Chicago, IL 60637, USA. \*Petersen acknowledges financial support from the Center for Research in Security Prices at the University of Chicago. We would like to thank William Abrams, Bob Anstiss, Dave Belding, Beverly Clingan, Roger Hendrick, and Terence Meehan and seminar participants at London Business School, Northwestern University, Southern Methodist University, University of Chicago, and University of Florida for many useful insights. The constructive criticism of Jerold Warner (the editor) and Michael Barclay (the referee) have significantly improved the paper. We express our appreciation for very efficient programming by Greg May and Paul Ho. The views in this paper are those of the authors and do not necessarily reflect the views of the institutions discussed, or of CRSP. 0304-405X/94/$07.00 0 1994-Elsevier Science B.V. All rights reserve” ([Petersen and Fialkowski, 1994, p. 209](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=1&annotation=29UAA825))
+
+“270 M.A. Petersen and D. Fialkowski, Posted versus effective spreads maintaining inventories [Ho and Stoll (1981)], and because liquidity providers face some traders who know more about the future value of the security than they [Treynor (1971) and Glosten and Milgrom (1985)]. In U.S. equity markets, however, trades inside the spread are a frequent occurrence.” ([Petersen and Fialkowski, 1994, p. 210](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=2&annotation=SYKCWV7Y))
+
+“Market orders may transact inside the spread if the specialist does not always display the best public limit orders. Market orders may also trade at prices better than the posted quotes when they are matched with other market orders. Once trades occur inside the posted spread, the posted spread overstates an investor’s expected trading costs. Since investors can expect to buy at prices lower than the ask and sell at prices higher than the bid, the effective spread is the relevant measure of trading costs.” ([Petersen and Fialkowski, 1994, p. 210](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=2&annotation=XWH4JZH6))
+
+“Lee (1993) and Blume and Goldstein (1992b) compare transaction prices to the midpoint of the quoted spread. This method accurately estimates the effective spread only when market orders are matched solely with limit orders (the specialist or a public limit order). If market orders are matched with each other, then this method will overestimate the spread actually paid. Some of the market orders will pay significantly less than the posted spread.” ([Petersen and Fialkowski, 1994, p. 210](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=2&annotation=YUGHMMXD))
+
+“Since these stocks are traded on many exchanges, there are many bids and asks for each equity issue. Price improvement can be calculated relative to th” ([Petersen and Fialkowski, 1994, p. 214](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=6&annotation=Q7EUDCEZ))
+
+“M.A. Petersen and D. Fialkowski, Posted versus effective spreads 215 best bid or offer (BBO). Across all exchanges the best bid is the highest bid price; the best offer is the lowest ask price.” ([Petersen and Fialkowski, 1994, p. 215](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=7&annotation=63N3SZC9))
+
+“To capture the true spread faced by traders, we use the measured price improvement to calculate the effective spread.” ([Petersen and Fialkowski, 1994, p. 215](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=7&annotation=9HZCDK8E))
+
+“A trader may receive price improvement both buying and selling. Thus, an estimate of the effective spread faced by a trader is Effective spread = Posted spread - 2 x Price improvement. (3) Intuitively, the effective spread is the expected purchase price minus the expected sales price.” ([Petersen and Fialkowski, 1994, p. 215](zotero://select/library/items/2CQEEYUA)) ([pdf](zotero://open-pdf/library/items/ICR74ERY?page=7&annotation=FU66S8PN))
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\223\245Inbox/@petersonEvaluationBiasesExecution2003.md" "b/references/obsidian/\360\237\223\245Inbox/@petersonEvaluationBiasesExecution2003.md"
new file mode 100644
index 00000000..8b30e3c9
--- /dev/null
+++ "b/references/obsidian/\360\237\223\245Inbox/@petersonEvaluationBiasesExecution2003.md"
@@ -0,0 +1,19 @@
+*title:* Evaluation of the biases in execution cost estimation using trade and quote data
+*authors:* Mark Peterson, Erik Sirri
+*year:* 2003
+*tags:* #effective-spread #trade-classification
+*status:* #📦
+*related:*
+*code:*
+*review:*
+
+## Notes 📍
+
+## Annotations 📖
+Note:
+
+“Examples of errors in trading cost estimates: The trading cost measures considered here are the effective spread and the relative effective spread. The effective spread, which represents the round trip execution costs, less commissions, is calculated as: Effective spread = 2 × D × (Price − midpoint), where D is the trade direction, +1 for a buy, and −1 for a sell. Using only TAQ data, one must infer D. The midpoint must also be estimated because the TAQ data report the trade time, not the order submission time. As noted in Bacidore et al. (1999), execution quality is most appropriately measured setting the benchmark quote to that prevailing at order submission time. The relative effective spread is calculated as Relative effective spread = Effective spread / price. Next, we demonstrate the possible limitations of trade and quote data in estimating trading costs by providing two specific examples—price improvement in minimum variation markets and quote changes prior to trade execution.” ([Peterson and Sirri, 2003, p. 261](zotero://select/library/items/8H44XMRH)) ([pdf](zotero://open-pdf/library/items/N5WH3RYR?page=3&annotation=9YNKPFT3))
+
+“In the literature, researchers use different definitions of trade initiators based presumably on data availability. Odders-White (2000) considers the last arriving order to be the trade initiator. She can make this determination because the TORQ database includes the NYSE audit file, which contains order-entry time for both sides of the trade. Papers such as Lee (1992) and Petersen and Fialkowski (1994) consider the active side to be market orders. Kraus and Stoll (1972) consider the active side to be the side with fewer parties. Finucane (2000) and Lee and Radhakrishna (2000) note many orders cannot be unambiguously defined as buyer- or seller-initiated. Finucane (2000) finds that nearly one-fourth of all trades do not occur as the result of the arrival of a market order. In his final analysis, Finucane (2000) examines trades with at least one standard non-tick sensitive buy or sell market order in the trade. Ellis et al. (2000) and Theissen (2000) take the approach of inferring trade direction from the side contra to the dealer.” ([Peterson and Sirri, 2003, p. 263](zotero://select/library/items/8H44XMRH)) ([pdf](zotero://open-pdf/library/items/N5WH3RYR?page=5&annotation=ULLIJ7VP))
+
+“Because we do not have access to the NYSE audit file, we cannot define a trade initiator in the same way as those who have used TORQ data. Therefore, our approach will be to begin with all regular-way orders and exclude orders that are most likely not initiators. The following orders are excluded: (a.) limit orders that are not ‘marketable’, that is buy orders with limit price less than the ask or sell orders with limit price greater than the bid, (b.) tick sensitive orders because they usually do not initiate trades, (c.) stopped,3 or guaranteed orders, because these orders tend to be more like limit orders, and (d.) partial executions of marketable limit orders for more shares than are at the best quote and execute in multiple parts.” ([Peterson and Sirri, 2003, p. 264](zotero://select/library/items/8H44XMRH)) ([pdf](zotero://open-pdf/library/items/N5WH3RYR?page=6&annotation=5IBM4V9P))
\ No newline at end of file
diff --git "a/references/obsidian/\360\237\226\274\357\270\217Media/asymmetric-spread.png" "b/references/obsidian/\360\237\226\274\357\270\217Media/asymmetric-spread.png"
new file mode 100644
index 00000000..9699f1c3
Binary files /dev/null and "b/references/obsidian/\360\237\226\274\357\270\217Media/asymmetric-spread.png" differ
diff --git "a/references/obsidian/\360\237\226\274\357\270\217Media/eff-spread-finucane.png" "b/references/obsidian/\360\237\226\274\357\270\217Media/eff-spread-finucane.png"
new file mode 100644
index 00000000..158badf1
Binary files /dev/null and "b/references/obsidian/\360\237\226\274\357\270\217Media/eff-spread-finucane.png" differ
diff --git "a/references/obsidian/\360\237\226\274\357\270\217Media/effective-spead-options.png" "b/references/obsidian/\360\237\226\274\357\270\217Media/effective-spead-options.png"
new file mode 100644
index 00000000..2768c233
Binary files /dev/null and "b/references/obsidian/\360\237\226\274\357\270\217Media/effective-spead-options.png" differ
diff --git "a/references/obsidian/\360\237\226\274\357\270\217Media/midpoint-spread.png" "b/references/obsidian/\360\237\226\274\357\270\217Media/midpoint-spread.png"
new file mode 100644
index 00000000..717cc5ff
Binary files /dev/null and "b/references/obsidian/\360\237\226\274\357\270\217Media/midpoint-spread.png" differ
diff --git a/reports/Content/bibliography.bib b/reports/Content/bibliography.bib
index d816fda0..38a57934 100644
--- a/reports/Content/bibliography.bib
+++ b/reports/Content/bibliography.bib
@@ -295,6 +295,11 @@ @incollection{bengioPracticalRecommendationsGradientBased2012
doi = {10.1007/978-3-642-35289-8_26}
}
+@article{bessembinderBidAskSpreadsMeasuring,
+ title = {Bid-{{Ask Spreads}}: {{Measuring Trade Execution Costs}} in {{Financial Markets}}},
+ author = {Bessembinder, Hendrik}
+}
+
@article{bessembinderIssuesAssessingTrade2003,
title = {Issues in Assessing Trade Execution Costs},
author = {Bessembinder, Hendrik},
@@ -902,6 +907,16 @@ @misc{culurcielloMemoryAttentionSequences2018
howpublished = {https://towardsdatascience.com/memory-attention-sequences-37456d271992}
}
+@article{daiEmbeddingLearning2022,
+ title = {Embedding {{Learning}}},
+ author = {Dai, Ben and Shen, Xiaotong and Wang, Junhui},
+ year = {2022},
+ journal = {Journal of the American Statistical Association},
+ volume = {117},
+ number = {537},
+ doi = {10.1080/01621459.2020.1775614}
+}
+
@misc{daiTransformerXLAttentiveLanguage2019,
title = {Transformer-{{XL}}: Attentive Language Models beyond a Fixed-Length Context},
author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
@@ -1476,6 +1491,16 @@ @article{Glosten_1988
pmid = {null}
}
+@article{goettlerEquilibriumDynamicLimit2005,
+ title = {Equilibrium in a {{Dynamic Limit Order Market}}},
+ author = {Goettler, Ronald L. and Parlour, Christine A. and Rajan, Uday},
+ year = {2005},
+ journal = {The Journal of Finance},
+ volume = {60},
+ number = {5},
+ doi = {10.1111/j.1540-6261.2005.00795.x}
+}
+
@book{goodfellowDeepLearning2016,
title = {Deep Learning},
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
@@ -1608,6 +1633,16 @@ @inproceedings{gyamerahStockMarketMovement2019
doi = {10.1109/CIFEr.2019.8759062}
}
+@article{hagstromerBiasEffectiveBidask2021,
+ title = {Bias in the Effective Bid-Ask Spread},
+ author = {Hagströmer, Björn},
+ year = {2021},
+ journal = {Journal of Financial Economics},
+ volume = {142},
+ number = {1},
+ doi = {10.1016/j.jfineco.2021.04.018}
+}
+
@inproceedings{hanAutoEncoderInspiredUnsupervised2018,
title = {{{AutoEncoder}} Inspired Unsupervised Feature Selection},
booktitle = {2018 {{IEEE International Conference}} on {{Acoustics}}, {{Speech}} and {{Signal Processing}} ({{ICASSP}})},
@@ -1703,6 +1738,16 @@ @article{hasbrouckTradesQuotesInventories1988
doi = {10.1016/0304-405X(88)90070-0}
}
+@article{hasbrouckTradingCostsReturns2009,
+ title = {Trading {{Costs}} and {{Returns}} for {{U}}.{{S}}. {{Equities}}: {{Estimating Effective Costs}} from {{Daily Data}}},
+ author = {Hasbrouck, Joel},
+ year = {2009},
+ journal = {The Journal of Finance},
+ volume = {64},
+ number = {3},
+ doi = {10.1111/j.1540-6261.2009.01469.x}
+}
+
@book{hastietrevorElementsStatisticalLearning2009,
title = {The Elements of Statistical Learning},
author = {Hastie, Trevor, Sami and Friedman, Harry and Tibshirani, Robert},
@@ -2285,6 +2330,16 @@ @article{leeInferringTradeDirection1991
doi = {10.1111/j.1540-6261.1991.tb02683.x}
}
+@article{leeMarketIntegrationPrice1993,
+ title = {Market {{Integration}} and {{Price Execution}} for {{NYSE-Listed Securities}}},
+ author = {Lee, Charles M. C.},
+ year = {1993},
+ journal = {The Journal of Finance},
+ volume = {48},
+ number = {3},
+ doi = {10.1111/j.1540-6261.1993.tb04028.x}
+}
+
@article{leePseudolabelSimpleEfficient,
title = {Pseudo-Label: The Simple and Efficient Semi-Supervised Learning Method for Deep Neural Networks},
author = {Lee, Dong-Hyun},
@@ -2776,6 +2831,17 @@ @misc{monnierCrossvalidationToolsTime2018
journal = {Medium}
}
+@article{muravyevOptionsTradingCosts2020,
+ title = {Options {{Trading Costs Are Lower}} than {{You Think}}},
+ author = {Muravyev, Dmitriy and Pearson, Neil D},
+ editor = {Van Nieuwerburgh, Stijn},
+ year = {2020},
+ journal = {The Review of Financial Studies},
+ volume = {33},
+ number = {11},
+ doi = {10.1093/rfs/hhaa010}
+}
+
@article{muravyevOrderFlowExpected2016,
title = {Order Flow and Expected Option Returns: Order Flow and Expected Option Returns},
author = {Muravyev, Dmitriy},
@@ -3145,22 +3211,21 @@ @article{perlinPerformanceTickTest2014
doi = {10.1016/j.qref.2013.07.009}
}
-@article{Petersen_1994,
- title = {Posted versus Effective Spreads: Good Prices or Bad Quotes?},
- author = {Petersen, Mitchell A. and Fialkowski, David},
- year = {1994},
- journal = {Journal of Financial Economics},
- doi = {10.1016/0304-405x(94)90034-5},
- mag_id = {1590007075},
- pmcid = {null},
- pmid = {null}
-}
-
@article{petersenMatrixCookbook,
title = {The Matrix Cookbook},
author = {Petersen, Kaare Brandt and Pedersen, Michael Syskind}
}
+@article{petersenPostedEffectiveSpreads1994,
+ title = {Posted versus Effective Spreads},
+ author = {Petersen, Mitchell A. and Fialkowski, David},
+ year = {1994},
+ journal = {Journal of Financial Economics},
+ volume = {35},
+ number = {3},
+ doi = {10.1016/0304-405X(94)90034-5}
+}
+
@article{petersonEvaluationBiasesExecution2003,
title = {Evaluation of the Biases in Execution Cost Estimation Using Trade and Quote Data},
author = {Peterson, Mark and Sirri, Erik},
diff --git a/reports/Content/effective-spread.tex b/reports/Content/effective-spread.tex
new file mode 100644
index 00000000..4192ba16
--- /dev/null
+++ b/reports/Content/effective-spread.tex
@@ -0,0 +1,19 @@
+\begin{tabular}{llll}
+ \toprule
+ & Nominal & Relative & $p$-value \\ \midrule
+ Effective Spread & & & - \\
+ Quoted Spread & & & - \\
+ Tick Test & & & \\
+ Rev. Tick Test & & & \\
+ Quote Rule & & & \\
+ \gls{LR} & & & \\
+ Rev. \gls{LR} & & & \\
+ \gls{EMO} & & & \\
+ Rev. \gls{EMO} & & & \\
+ \gls{CLNV} & & & \\
+ Rev. \gls{CLNV} & & & \\
+ Trade Size $\to$ Depth Rule $\to$ Rev. Tick Test & & & \\
+ \gls{GBM} & & & \\
+ TabTransformer & & & \\
+ FT-Transformer & & & \\ \bottomrule
+\end{tabular}
\ No newline at end of file
diff --git a/reports/Content/main.tex b/reports/Content/main.tex
index e0f3eff9..0d951370 100644
--- a/reports/Content/main.tex
+++ b/reports/Content/main.tex
@@ -30,7 +30,10 @@ \section{Related Work (3~p)}\label{sec:related-work}
\newpage
\section{Rule-Based Approaches}\label{sec:rule-based-approaches}
-Every option trade has a buyer and seller side. For a plethora of problems in option research, it's also vital to determine the party that initiated the transaction. The trade initiator is binary and can either be the seller or the buyer. Consequently, we denote it by $\gls{y} \in \{0,1\}$, whereby $y=0$ indicates a seller-initiated and $y=1$ a buyer-initiated trade. As the trade initiator is commonly not provided with the option data sets, it must be inferred using trade classification algorithms \autocite[][453]{easleyOptionVolumeStock1998}.
+Every option trade has a buyer and seller side. For a plethora of problems in option research, it's also vital to determine the party that initiated the transaction. The trade initiator is binary and can either be the seller or the buyer. Consequently, we denote it by $\gls{y} \in \{-1,1\}$, whereby $y=-1$ indicates a seller-initiated and $y=1$ a buyer-initiated trade. As the trade initiator is commonly not provided with the option data sets, it must be inferred using trade classification algorithms \autocite[][453]{easleyOptionVolumeStock1998}.
+
+% TODO: Write about different views on trade initiator in one sentence. Might be due to technical limitations. Ours is likely similar to Ellis or Theissen -> also Savickas? (found in Peterson and Sirri)
+% TODO: We use $\hat{y}$ to distinguish the predicted from the observed trade initiator.
The following section introduces basic rules for option trade classification. We start with the classical quote and tick rule and continue with the more recent depth and trade size rule. Our focus is on classification rules, that sign trades on a trade-by-trade basis. Consequently, we omit classification rules for aggregated trades, like the \cgls{BVC} algorithm of \textcite[][1466--1468]{easleyFlowToxicityLiquidity2012}.
@@ -40,14 +43,14 @@ \subsection{Basic Rules}\label{sec:basic-rules}
\subsubsection{Quote Rule}\label{sec:quote-rule}
-The quote rule classifies a trade by comparing the trade price against the corresponding quotes at the time of the trade. We denote the sequence of trade prices of the $i$-th security by $\gls{p}_i = \langle p_{i,1},p_{i,2},\dots,p_{i,T}\rangle$ and the corresponding ask at $t$ by $\gls{a}_{i,t}$ and bid by $\gls{b}_{i,t}$. If the trade price is above the midpoint of the bid-ask spread, denoted by $\gls{m}_{i,t} = \tfrac{1}{2}(b_{i,t} + a_{i,t})$, the trade is classified as a buy and if it is below the midpoint, as a sell \autocite[][41]{harrisDayEndTransactionPrice1989}. Thus, the classification rule on $D = \left\{(i, t) \in \mathbb{N}^2: p_{i,t} \neq m_{i,t}\right\}$ is given by:
+The quote rule classifies a trade by comparing the trade price against the corresponding quotes at the time of the trade. We denote the sequence of trade prices of the $i$-th security by $\gls{P}_i = \langle P_{i,1},P_{i,2},\dots,P_{i,T}\rangle$ and the corresponding ask at $t$ by $\gls{A}_{i,t}$ and bid by $\gls{B}_{i,t}$. If the trade price is above the midpoint of the bid-ask spread, denoted by $\gls{M}_{i,t} = \tfrac{1}{2}(B_{i,t} + A_{i,t})$, the trade is classified as a buy and if it is below the midpoint, as a sell \autocite[][41]{harrisDayEndTransactionPrice1989}. Thus, the classification rule on $D = \left\{(i, t) \in \mathbb{N}^2: P_{i,t} \neq M_{i,t}\right\}$ is given by:
\begin{equation}
-\operatorname{quote}\colon D \to \left\{0, 1\right\},\quad
+\operatorname{quote}\colon D \to \left\{-1, 1\right\},\quad
\operatorname{quote}(i, t)=
\begin{cases}
-0, & \text{if}\ p_{i, t}>m_{i, t}\\
-1, & \text{if}\ p_{i, t}<m_{i, t}\\
+1, & \text{if}\ P_{i, t}>M_{i, t}\\
+-1, & \text{if}\ P_{i, t}<M_{i, t}\\
-1, & \text{if}\ t=1 \lor p_{t}>p_{t-1} \\
-0, & \text{if}\ p_{i, t} < p_{i, t-1}\\
+1, & \text{if}\ t=1 \lor P_{t}>P_{t-1} \\
+-1, & \text{if}\ P_{i, t} < P_{i, t-1}\\
\operatorname{tick}(i, t-1), & \text{else}.
\end{cases}
\label{eq:tick-test}
@@ -76,13 +79,13 @@ \subsubsection{Tick Test}\label{sec:tick-test}
By this means, the tick rule can sign all trades as long as a last differing trade price exists, but the overall precision can be impacted by infrequent trading. Being only dependent on transaction data makes the tick rule highly data-efficient. Waiving any quote data for classification contributes to this efficiency, but also poses a major limitation with regard to trades at the bid or ask, as discussed by \textcite[][557--558]{finucaneDirectTestMethods2000}. For instance, if quotes rise between trades, then a sale at the bid on an uptick or zero uptick is misclassified as a buy by the tick test due to the overall increased trade price. Similarly for falling quotes, buys at the ask on downticks or zero downticks are erroneously classified as a sell.
-The reverse tick test, $\operatorname{rtick} \colon \mathbb{N}^2 \to \left\{0, 1\right\}$, is a variant of the tick test proposed in \textcite[][241]{hasbrouckTradesQuotesInventories1988}. It is similar to the tick rule but classifies based on the next, distinguishable trade price.
+The reverse tick test, $\operatorname{rtick} \colon \mathbb{N}^2 \to \left\{-1, 1\right\}$, is a variant of the tick test proposed in \textcite[][241]{hasbrouckTradesQuotesInventories1988}. It is similar to the tick rule but classifies based on the next, distinguishable trade price.
\begin{equation}
\operatorname{rtick}(i, t)=
\begin{cases}
-1,& \text{if}\ t+1=T \lor p_{i, t} > p_{i, t+1} \\
-0,& \text{if}\ p_{i, t} < p_{i, t+1}\\
+1,& \text{if}\ t+1=T \lor P_{i, t} > P_{i, t+1} \\
+-1, & \text{if}\ P_{i, t} < P_{i, t+1}\\
\operatorname{rtick}(i, t+1), & \text{else}
\end{cases}
\label{eq:reverse-tick-test}
@@ -96,16 +99,19 @@ \subsubsection{Tick Test}\label{sec:tick-test}
\subsubsection{Depth Rule}\label{sec:depth-rule}
+% TODO: These proxies have in common that they factor in the order book imbalance the relative depth quoted at the best bid and ask prices. If traders care about transaction costs, the relatively wide ask-side spread deters buyers, whereas the tight bid-side spread may attract sellers. There are then more traders submitting market orders at the bid side, and the true effective spread is, on average, smaller than the average midpoint effective spread.
+% TODO: Derive in greater detail why orderbook imbalance makes sense! See my notes from Hagströmer
+
As \cref{sec:quote-rule} unveils, the tick rule yields significantly lower success rates than the quote rule. For midspread trades, that otherwise cannot be classified by the advantageous quote rule, \textcite[][14]{grauerOptionTradeClassification2022} propose the depth rule.
The depth rule infers the trade initiator from the quoted size at the best bid and ask. Based on the observation that an exceeding bid or ask size relates to higher liquidity at one trade side, trades are classified as a buy (sell) for a larger ask (bid) size \autocite[][14]{grauerOptionTradeClassification2022}.
-Let $\gls{a-tilde}_{i,t}$ denote the quoted size of the ask, $\gls{b-tilde}_{i,t}$ of the bid, and $\gls{p-tilde}_{i,t}$ the trade price at $t$ of the $i$-th option. We set the domain as $D = \left\{(i, t) \in \mathbb{N}^2: p_{i,t} = \gls{m}_{i,t} \land \tilde{a}_{i,t} \neq \tilde{b}_{i,t} \right\}$. The depth rule, $\operatorname{depth} \colon D \to \left\{0,1\right\}$, is now defined as:
+Let $\gls{A-tilde}_{i,t}$ denote the quoted size of the ask, $\gls{B-tilde}_{i,t}$ of the bid, and $\gls{P-tilde}_{i,t}$ the trade price at $t$ of the $i$-th option. We set the domain as $D = \left\{(i, t) \in \mathbb{N}^2: P_{i,t} = \gls{M}_{i,t} \land \tilde{A}_{i,t} \neq \tilde{B}_{i,t} \right\}$. The depth rule, $\operatorname{depth} \colon D \to \left\{-1, 1\right\}$, is now defined as:
\begin{equation}
\operatorname{depth}(i, t)=
\begin{cases}
-0, & \text{if}\ \tilde{a}_{i,t} < \tilde{b}_{i,t} \land p_{i, t} = m_{i, t}\\
-1, & \text{if}\ \tilde{a}_{i,t} > \tilde{b}_{i,t} \land p_{i, t} = m_{i, t}. \\
+1, & \text{if}\ \tilde{A}_{i,t} > \tilde{B}_{i,t} \land P_{i, t} = M_{i, t}. \\
+-1, & \text{if}\ \tilde{A}_{i,t} < \tilde{B}_{i,t} \land P_{i, t} = M_{i, t} \\
\end{cases}
\label{eq:depth-rule}
\end{equation}
@@ -116,17 +122,17 @@ \subsubsection{Depth Rule}\label{sec:depth-rule}
\subsubsection{Trade Size Rule}\label{sec:trade-size-rule}
-As \cref{sec:tick-test} derives, quote-based approaches are generally preferred due to their stronger performance. \textcite[][13]{grauerOptionTradeClassification2022} stress, however, that the quote rule systematically misclassifies limit orders, and propose an override. On $D = \left\{(i, t) \in \mathbb{N}^2: \tilde{p}_{i,t} = \tilde{a}_{i,t} \neq \tilde{b}_{i,t} \lor \tilde{p}_{i,t} \neq\tilde{a}_{i,t} = \tilde{b}_{i,t} \right\}$ the trade size rule, $\operatorname{tsize} \colon D \to \left\{0,1\right\}$, is defined as:
+As \cref{sec:tick-test} derives, quote-based approaches are generally preferred due to their stronger performance. \textcite[][13]{grauerOptionTradeClassification2022} stress, however, that the quote rule systematically misclassifies limit orders, and propose an override. On $D = \left\{(i, t) \in \mathbb{N}^2: \tilde{P}_{i,t} = \tilde{A}_{i,t} \neq \tilde{B}_{i,t} \lor \tilde{P}_{i,t} \neq\tilde{A}_{i,t} = \tilde{B}_{i,t} \right\}$ the trade size rule, $\operatorname{tsize} \colon D \to \left\{-1, 1\right\}$, is defined as:
\begin{equation}
\operatorname{tsize}(i, t)=
\begin{cases}
-1, & \text{if}\ \tilde{p}_{i, t} = \tilde{b}_{i, t} \neq \tilde{a}_{i, t}\\
-0, & \text{if}\ \tilde{p}_{i, t} = \tilde{a}_{i, t} \neq \tilde{b}_{i, t}. \\
+1, & \text{if}\ \tilde{P}_{i, t} = \tilde{B}_{i, t} \neq \tilde{A}_{i, t}\\
+-1, & \text{if}\ \tilde{P}_{i, t} = \tilde{A}_{i, t} \neq \tilde{B}_{i, t}. \\
\end{cases}
\label{eq:trade-size-rule}
\end{equation}
-The trade size rule in \cref{eq:trade-size-rule} classifies based on a match between the size of the trade $\tilde{p}_{i, t}$ and the quoted bid and ask sizes. The rationale is, that the market maker tries to fill the limit order of a customer, which results in the trade being executed at the contemporaneous bid or ask, with a trade size equalling the quoted size \autocite[][13]{grauerOptionTradeClassification2022}. When both the size of the ask and bid correspond with the trade size, the result is ambiguous.
+The trade size rule in \cref{eq:trade-size-rule} classifies based on a match between the size of the trade $\tilde{P}_{i, t}$ and the quoted bid and ask sizes. The rationale is, that the market maker tries to fill the limit order of a customer, which results in the trade being executed at the contemporaneous bid or ask, with a trade size equalling the quoted size \autocite[][13]{grauerOptionTradeClassification2022}. When both the size of the ask and bid correspond with the trade size, the result is ambiguous.
\textcite[][13]{grauerOptionTradeClassification2022} obtain an accuracy of \SI{79.92}{\percent} for the subset of option trades at the\cgls{ISE} (\SI{22.3}{\percent} of all trades) that can be signed using the methodology, which elevates the performance by \SI{11}{\percent} for the entire sample. Expectedly, the improvement is highest for trades at the quotes and reverses for trades outside the quote \autocite[][15]{grauerOptionTradeClassification2022}. Based on these results, the trade size rule may only be applied selectively to trades inside or at the quote. Since only a fraction of all trades can be classified with the trade size rule, the rule must be combined with other basic or hybrid rules for complete coverage. The subsequent section introduces four hybrid algorithms, that combine basic rules into more sophisticated algorithms.
@@ -164,7 +170,9 @@ \subsection{Hybrid Rules}\label{sec:hybrid-rules}
Popular variants include the \cgls{LR} algorithm, the \cgls{EMO} rule, and the \cgls{CLNV} method. All three algorithms utilize the quote and tick rule to a varying extent, as depicted in \cref{fig:hybrid-lr,fig:hybrid-emo,fig:hybrid-clnv}. Basic rules are selected based on the proximity of the trade price to the quotes. We study all algorithms in detail in \cref{sec:lee-and-ready-algorithm,sec:ellis-michaely-ohara-rule,sec:chakarabarty-li-nguyen-van-ness-method}.
-\textcite[][18]{grauerOptionTradeClassification2022} combine basic or hybrid rules through stacking. One such combination is depicted in \cref{fig:hybrid-grauer}. This approach is notably different from the aforementioned algorithms, as the applied rule is no longer dependent on the proximity to the quotes, but rather on the classifiability of the trade with the primary rules and their ordering. We cover this generic approach last.
+
+As put forth by \textcite[][18]{grauerOptionTradeClassification2022}, basic or hybrid rules can be combined through stacking. One such combination is depicted in \cref{fig:hybrid-grauer}. This approach is notably different from the aforementioned algorithms, as the applied rule is no longer dependent on the proximity to the quotes, but rather on the classifiability of the trade with the primary rules and their ordering. We cover this generic approach last.
+
\subsubsection{Lee and Ready Algorithm}\label{sec:lee-and-ready-algorithm}
The popular \cgls{LR} algorithm \autocite[][745]{leeInferringTradeDirection1991} combines the (reverse) tick test and quote rule into a single rule, which is derived from two observations. First, \textcite[][735--743]{leeInferringTradeDirection1991} observe a higher precision of the quote rule over the tick rule, which makes it their preferred choice. Second, by the means of a simple model, the authors demonstrate that the tick test can correctly classify at least \SI{85.0}{\percent} of all midspread trades if the model's assumptions of constant quotes between trades and the arrival of the market and standing orders following a Poisson process are met.
@@ -172,10 +180,10 @@ \subsubsection{Lee and Ready Algorithm}\label{sec:lee-and-ready-algorithm}
In combination, the algorithm primarily signs trades according to the quote rule. Trades at the midpoint of the spread, unclassifiable by the quote rule, are classified by the tick rule. Overall:
\begin{equation}
-\operatorname{lr} \colon \mathbb{N}^2 \to \left\{0,1\right\}\quad\operatorname{lr}(i,t)=
+\operatorname{lr} \colon \mathbb{N}^2 \to \left\{-1, 1\right\}\quad\operatorname{lr}(i,t)=
\begin{cases}
-1, & \text{if}\ p_{i, t} > m_{i, t} \\
-0, & \text{if}\ p_{i, t} < m_{i, t} \\
+1, & \text{if}\ P_{i, t} > M_{i, t} \\
+-1, & \text{if}\ P_{i, t} < M_{i, t} \\
\operatorname{tick}(i, t), & \text{else}.
\end{cases}
\end{equation}
@@ -189,11 +197,11 @@ \subsubsection{Ellis-Michaely-O'Hara
As such, the \cgls{EMO} algorithm extends the tick rule by classifying trades at the quotes using the quote rule, and all other trades with the tick test. Formally, the classification rule is given by:
\begin{equation}
-\operatorname{emo} \colon \mathbb{N}^2 \to \left\{0, 1 \right\}, \quad
+\operatorname{emo} \colon \mathbb{N}^2 \to \left\{-1, 1\right\}, \quad
\operatorname{emo}(i, t)=
\begin{cases}
-1, & \text{if}\ p_{i, t} = a_{i, t} \\
-0, & \text{if}\ p_{i, t} = b_{i, t} \\
+1, & \text{if}\ P_{i, t} = A_{i, t} \\
+-1, & \text{if}\ P_{i, t} = B_{i, t} \\
\operatorname{tick}(i, t), & \text{else}.
\end{cases}
\label{eq:emo-rule}
@@ -207,17 +215,19 @@ \subsubsection{Chakrabarty-Li-Nguyen-Van-Ness
Like the previous two algorithms, the \cgls{CLNV} method of \textcite[][3809]{chakrabartyTradeClassificationAlgorithms2012} is a hybrid of the quote and tick rule and extends the \cgls{EMO} rule by a differentiated treatment of trades inside the quotes, which are notoriously hard to classify. The authors segment the bid-ask spread into deciles (ten equal-width bins) and classify trades around the midpoint (\nth{4} to \nth{7} decile) by the tick rule and trades close or outside the quotes are categorized by the quote rule.
\begin{equation}
-\operatorname{clnv} \colon \mathbb{N}^2 \to \left\{0, 1 \right\}, \quad
+\operatorname{clnv} \colon \mathbb{N}^2 \to \left\{-1, 1\right\}, \quad
\operatorname{clnv}(i, t)=
\begin{cases}
-1, & \text{if}\ p_{i, t} \in \left(\frac{3}{10} b_{i,t} + \frac{7}{10} a_{i,t}, a_{i, t}\right] \\
-0, & \text{if}\ p_{i, t} \in \left[ b_{i,t}, \frac{7}{10} b_{i,t} + \frac{3}{10} a_{i,t}\right) \\
+1, & \text{if}\ P_{i, t} \in \left(\frac{3}{10} B_{i,t} + \frac{7}{10} A_{i,t}, A_{i, t}\right] \\
+-1, & \text{if}\ P_{i, t} \in \left[ B_{i,t}, \frac{7}{10} B_{i,t} + \frac{3}{10} A_{i,t}\right) \\
\operatorname{tick}(i, t), & \text{else}
\end{cases}
\label{eq:CLNV-rule}
\end{equation}
-The algorithm is summarized in \cref{eq:CLNV-rule}. It is derived from a performance comparison of the tick rule (\cgls{EMO} rule) against the quote rule (\cgls{LR} algorithm) on stock data, whereby the accuracy was assessed separately for each decile \footnote{The spread is assumed to be positive and evenly divided into ten deciles and the \nth{1} to \nth{3} deciles are classified by the quote rule. Counted from the bid, the \nth{1} decile starts at $b_{i,t}$ and ends at $b_{i,t} + \tfrac{3}{10} (a_{i,t} - b_{i,t}) = \tfrac{7}{10} b_{i,t} + \tfrac{3}{10} a_{i,t}$ \nth{3} decile. As all trade prices are below the midpoint, they are classified as a sell.}. The classical \cgls{CLNV} method uses the backward-looking tick rule. In the spirit of \textcite[][735]{leeInferringTradeDirection1991}, the tick test could be exchanged for the reverse tick test.
+% TODO: success rates are sensitive to trade location.
+
+The algorithm is summarized in \cref{eq:CLNV-rule}. It is derived from a performance comparison of the tick rule (\cgls{EMO} rule) against the quote rule (\cgls{LR} algorithm) on stock data, whereby the accuracy was assessed separately for each decile \footnote{The spread is assumed to be positive and evenly divided into ten deciles and the \nth{1} to \nth{3} deciles are classified by the quote rule. Counted from the bid, the \nth{1} decile starts at $B_{i,t}$ and ends at $B_{i,t} + \tfrac{3}{10} (A_{i,t} - B_{i,t}) = \tfrac{7}{10} B_{i,t} + \tfrac{3}{10} A_{i,t}$ \nth{3} decile. As all trade prices are below the midpoint, they are classified as a sell.}. The classical \cgls{CLNV} method uses the backward-looking tick rule. In the spirit of \textcite[][735]{leeInferringTradeDirection1991}, the tick test could be exchanged for the reverse tick test.
\subsubsection{Stacked Rule}\label{sec:stacked-rule}
@@ -274,7 +284,7 @@ \subsubsection{Decision Tree}\label{sec:decision-tree}
\min _{j, s}\left[\min _{c_1} \sum_{x_i \in R_1(j, s)}\left(y_i-c_1\right)^2+\min _{c_2} \sum_{x_i \in R_2(j, s)}\left(y_i-c_2\right)^2\right].
\end{equation}
-The procedure is repeated on the so-created child nodes. Note that, splits are performed greedily to keep computations tractable. This entails, that only the reduction in \gls{SSE} of the current node is considered, and not the improvement from any subsequent splits in the child nodes. Computational costs may still be high, when there are many split candidates, due to a large feature count or possible split values. Common approximations are to split on quantized features or random feature subsets.
+The procedure is repeated on the so-created child nodes. Note that, splits are performed greedily to keep computations tractable. This entails, that only the reduction in \gls{SSE} of the current node is considered, and not the improvement from any subsequent splits in the child nodes. Computational costs may still be high, when there are many split candidates, due to a large feature count or possible split values. Common approximations are to split on quantized features or random feature subsets.
% TODO add citations where quantization is used. See ke (gradient boosting paper)
Trivially, growing deeper trees leads to an improvement in the \gls{SSE}. Considering the extreme, where each sample has its region, the tree would achieve a perfect fit in-sample but perform poorly on out-of-sample data. To reduce the sensitivity of the tree to changes in the training data, hence \emph{variance}, size complexity pruning procedures are employed. Likewise, if the decision tree is too simplistic, a high bias contributes to the model's overall expected error. Both extremes are to be avoided.
@@ -369,13 +379,13 @@ \subsubsection{Position-wise Feed-Forward Networks}\label{sec:position-wise-ffn}
\textcite[][9]{vaswaniAttentionAllYou2017} set the hidden dimension to be two to eight magnitudes of the embedding dimension. The large capacity strengthens the model's ability to retain information but also contributes significantly to the high computational requirements and memory footprint of Transformers \autocites[][5]{tayEfficientTransformersSurvey2022}[][1]{kitaevReformerEfficientTransformer2020}. Both linear transformations are separated by a \gls{ReLU} \gls{activation-function} \autocite[][318]{glorotDeepSparseRectifier2011} to introduce non-linearities to the network.
-Like the attention layer, the position-wise \gls{FFN} is surrounded by residual connections, followed by layer normalization (see \cref{sec:residual-connections-layer-norm}). Both are vital for the training process and convergence of the overall network, as we show. Optionally, dropout \autocite[][1930]{srivastavaDropoutSimpleWay} is added to prevent the model from \gls{overfitting}.
+Like the attention layer, the position-wise \gls{FFN} is surrounded by residual connections, followed by layer normalization (cp. \cref{sec:residual-connections-layer-norm}). Both are vital for the training process and convergence of the overall network, as we show. Optionally, dropout \autocite[][1930]{srivastavaDropoutSimpleWay} is added to prevent the model from \gls{overfitting}.
\subsubsection{Residual Connections and Layer Normalization}\label{sec:residual-connections-layer-norm}
Recall from earlier chapters, that the encoder stacks multiple Transformer blocks, each of which consists of several sublayers, resulting in a deep network. While depth is inevitable to learn hierarchical representations, the training of such a network is complicated. As neural networks are commonly trained using backpropagation, which relies on the gradient of the error to be propagated through the network starting at the last layer, vanishing or \glspl{exploding-gradient} pose a major difficulty in training deep neural nets \autocite[][1]{heDeepResidualLearning2015}. Without countermeasures, stacking multiple layers in the encoder and decoder of the Transformers impedes the gradient information to flow efficiently through the network and hampers the training behaviour \autocite[][1811]{wangLearningDeepTransformer2019}.
-As a remedy, \textcite[][3]{vaswaniAttentionAllYou2017} employ residual connections around each sublayer, whereby the output of the sublayer is added element-wisely to its input. Intuitively, the residual connection provides an alternative path for information to flow through the network, since some information can bypass the sublayer and thereby reach deeper layers within the stack. Also, vanishing or \glspl{exploding-gradient} are mitigated, as gradients can bypass the sublayer, eventually contributing towards an easier optimization \autocite[][3591]{liuRethinkingSkipConnection2020}. Residual connections moreover help to preserve the positional embeddings (see \cref{sec:positional-encoding}), as the layer's inputs are maintained in the identity mapping. Another technique to improve the training behaviour is layer normalization.
+As a remedy, \textcite[][3]{vaswaniAttentionAllYou2017} employ residual connections around each sublayer, whereby the output of the sublayer is added element-wisely to its input. Intuitively, the residual connection provides an alternative path for information to flow through the network, since some information can bypass the sublayer and thereby reach deeper layers within the stack. Also, vanishing or \glspl{exploding-gradient} are mitigated, as gradients can bypass the sublayer, eventually contributing towards an easier optimization \autocite[][3591]{liuRethinkingSkipConnection2020}. Residual connections moreover help to preserve the positional embeddings (cp. \cref{sec:positional-encoding}), as the layer's inputs are maintained in the identity mapping. Another technique to improve the training behaviour is layer normalization.
\textcite[][3]{vaswaniAttentionAllYou2017} extensively draw on layer normalization \autocite[][4]{baLayerNormalization2016} after the multi-headed attention and feed-forward sublayers. It is used for normalizing the activations of the sublayer and to stabilize and accelerate the training of the network \autocite[][2]{baLayerNormalization2016}. For the Transformer, the normalization statistics are calculated separately for every instance, which guarantees scalability across different batch sizes. For a vector $\gls{e}\in \mathbb{R}^{d_e}$ the normalized output is given by
\begin{equation}
@@ -465,7 +475,7 @@ \subsubsection{FT-Transformer}\label{sec:fttransformer}
\end{align}
\end{subequations}
-Recall from our discussion on self-attention (see \cref{sec:attention}), that each token encodes the tokens within the sequence. Based on this notion, \textcite[][4174]{devlinBERTPretrainingDeep2019} prepend a specialized $\texttt{[CLS]}$ token to the sequence, which stores the sequence's aggregate representation. Like any other token, the $\texttt{[CLS]}$ token is embedded first and contextualized in the encoder. Its final hidden state is then used for classification. \textcite[][4]{gorishniyRevisitingDeepLearning2021} adapt the idea of a $\texttt{[CLS]}$ token for tabular representation models. Similar to the embeddings of categorical or continuous features, the embedding of the $[\texttt{CLS}]$ token $\gls{e}_\texttt{[CLS]} \in \mathbb{R}^{d_{e}}$ is prepended to the column embeddings with $\gls{X} = \left[\gls{e}_\texttt{[CLS]}, \gls{e}_1, \gls{e}_2, \ldots \gls{e}_{n}\right]$, where $\gls{X} \in \mathbb{R}^{d_{e} \times n +1}$. Like before, $\gls{X}$ is passed through a stack of Transformer layers. The updated representation of the $\texttt{[CLS]}$ token is used exclusively for prediction:
+Recall from our discussion on self-attention (cp. \cref{sec:attention}), that each token encodes the tokens within the sequence. Based on this notion, \textcite[][4174]{devlinBERTPretrainingDeep2019} prepend a specialized $\texttt{[CLS]}$ token to the sequence, which stores the sequence's aggregate representation. Like any other token, the $\texttt{[CLS]}$ token is embedded first and contextualized in the encoder. Its final hidden state is then used for classification. \textcite[][4]{gorishniyRevisitingDeepLearning2021} adapt the idea of a $\texttt{[CLS]}$ token for tabular representation models. Similar to the embeddings of categorical or continuous features, the embedding of the $[\texttt{CLS}]$ token $\gls{e}_\texttt{[CLS]} \in \mathbb{R}^{d_{e}}$ is prepended to the column embeddings with $\gls{X} = \left[\gls{e}_\texttt{[CLS]}, \gls{e}_1, \gls{e}_2, \ldots \gls{e}_{n}\right]$, where $\gls{X} \in \mathbb{R}^{d_{e} \times n +1}$. Like before, $\gls{X}$ is passed through a stack of Transformer layers. The updated representation of the $\texttt{[CLS]}$ token is used exclusively for prediction:
\begin{equation}
P=\texttt{linear}\left(\texttt{ReLU}\left(\texttt{layer\_norm}\left(\gls{X}\left[:,0\right]\right)\right)\right).
\label{eq:bert-ft}
@@ -506,12 +516,12 @@ \subsubsection{Feature Engineering (1.5~p)}\label{sec:feature-engineering}
\subsubsection{Train-Test Split (0.5~p)}\label{sec:train-test-split}
-\begin{figure}[ht]
- \centering
- \includegraphics{train-test-split.pdf}
- \caption[Training Schemes]{Training Schemes. Own work.}
- \label{fig:train-test-split}
-\end{figure}
+% \begin{figure}[ht]
+% \centering
+% \includegraphics{train-test-split.pdf}
+% \caption[Training Schemes]{Training Schemes. Own work.}
+% \label{fig:train-test-split}
+% \end{figure}
\subsection{Training and Tuning (10~p)}\label{sec:training-and-tuning}
@@ -535,7 +545,7 @@ \subsubsection{Feature Importance
\textbf{Attention Maps}
In addition to random feature permutation, Transformer-based models offer \emph{some} interpretability through their attention mechanism~\footnote{One has to distinguish interpretability through \emph{explainability} from \emph{transparency} \autocite[][4--5]{liptonMythosModelInterpretability2017}. In recent research a major controversy embarked around the question, of whether attention offers explanations to model predictions \autocites[cp.][150]{bastingsElephantInterpretabilityRoom2020}[][5--7]{jainAttentionNotExplanation2019}[][9]{wiegreffeAttentionNotNot2019}. The debate sparked around opposing definitions of explainability and the consistency of attention scores with other, established feature-importance measures. Our focus is less on post-hoc explainability of the model, but rather on transparency. Consistent with \textcite[][8]{wiegreffeAttentionNotNot2019} we view attention scores as a vehicle to model transparency.
-}. Recall from our discussion on attention (see \cref{sec:attention}) that the attention matrix stores how much attention a token pays to each of the keys. Thus, feature attributions can be derived from attention by visualizing features that the model is paying attention to in an attention map. While attention maps are specific to Transformers or other attention-based architectures, rendering them useless for cross-model comparisons, they give additional insights from different attention layers and attention heads of the model on a per-trade and global basis. An example is shown in \cref{fig:attention-maps}.
+}. Recall from our discussion on attention (cp. \cref{sec:attention}) that the attention matrix stores how much attention a token pays to each of the keys. Thus, feature attributions can be derived from attention by visualizing features that the model is paying attention to in an attention map. While attention maps are specific to Transformers or other attention-based architectures, rendering them useless for cross-model comparisons, they give additional insights from different attention layers and attention heads of the model on a per-trade and global basis. An example is shown in \cref{fig:attention-maps}.
\begin{figure}[ht]
\centering
@@ -569,7 +579,7 @@ \subsubsection{Feature Importance
In this approach, the element-wise product between the gradient of the attention map $\nabla \boldsymbol{A}^{(l)}=\frac{\partial y_t}{\partial \boldsymbol{A}}$ for the model's target class $t$ and the attention map $\boldsymbol{A}^{(l)}$ is calculated to weight the attention head's importance. As previously suggested in \textcite[][786]{cheferTransformerInterpretabilityAttention2021}, negative contributions are eliminated to focus on the positive relevance, and the results are averaged over the heads dimension. Like all other presented approaches \cref{eq:attention-map-rollout,eq:attention-map-weighted} can be computed with a single forward pass and is therefore computationally efficient.
-In absence of ground truth for the true feature attribution, we resort to attention maps using \cref{eq:attention-map-weighted}. Following prior research, feature attributions are also summed over the first attention layer or all transformer blocks. Due to the limitation that TabTransformer (see \cref{sec:tabtransformer}) only performs self-attention on categorical features, no feature attributions for numerical features are calculated. The level of agreement between attributions from attention maps and kernel \gls{SHAP} is quantified by calculating Spearman's rank correlation between them.
+In absence of ground truth for the true feature attribution, we resort to attention maps using \cref{eq:attention-map-weighted}. Following prior research, feature attributions are also summed over the first attention layer or all transformer blocks. Due to the limitation that TabTransformer (cp. \cref{sec:tabtransformer}) only performs self-attention on categorical features, no feature attributions for numerical features are calculated. The level of agreement between attributions from attention maps and kernel \gls{SHAP} is quantified by calculating Spearman's rank correlation between them.
The next chapter discusses different metrics to assess the prediction quality of our models.
@@ -591,9 +601,38 @@ \subsection{Feature Importance (3~p)}\label{sec:feature-importance}
\subsection{Ablation Study of Models (2~p)}\label{sec:ablation-study}
\newpage
-\section{Application in Transaction Cost Estimation (optional)}\label{sec:application}
-\subsection{Simulation Setup (optional)}\label{sec:simulation-setup}
-\subsection{Simulation Results (optional)}\label{sec:simulation-results}
+\section{Application in Transaction Cost Estimation}\label{sec:application}
+\subsection{Simulation Setup}\label{sec:simulation-setup}
+
+Albeit the classification accuracy is a reasonable measure for comparing classifiers, one cannot immediately infer how changes in accuracy e.~g., an improvement by \SI{1}{\percent}, affect the application domains. In an attempt to make our results tangible, we apply all algorithms to estimate trading cost, a problem we previously identified to be reliant on correct trade classification (cp. \cref{sec:introduction}) and a common testing ground for trade classification rules \autocites[cp.][541]{ellisAccuracyTradeClassification2000}[][569]{finucaneDirectTestMethods2000}[][271--278]{petersonEvaluationBiasesExecution2003}[][896--897]{savickasInferringDirectionOption2003}.
+
+One of the most widely adopted measures for trading costs is the effective spread \autocite[][112]{Piwowar_2006}. It is defined as the difference between the trade price and the fundamental value of the asset \autocite[][238--239]{bessembinderIssuesAssessingTrade2003}. Following \textcite[][238--239]{bessembinderIssuesAssessingTrade2003}, we define the \emph{nominal, effective spread} as
+\begin{equation}
+ S_{i,t} = 2 (P_{i,t} - V_{i,t}) D_{i,t}.
+ \label{eq:effective-spread}
+\end{equation}
+
+Like before, $i$ indexes the security and $t$ denotes the trade. Here, $D_{i,t}$ is the trade direction, which is either $1$ for customer buy orders and $-1$ for customer sell orders. If the trade initiator is known, we set $D_{i,t} = y_{i,t}$ and $D_{i,t}=\hat{y}_{i,t}$, if inferred from a rule or classifier. As the fundamental value $V_{i,t}$ is unobserved at the time of the trade, we follow a common track in research and use the midpoint of the prevailing quotes as an observable proxy \footnote{An alternative treatment for options is discussed in \textcite[][4975--4976]{muravyevOptionsTradingCosts2020}. Our focus is on the midspread, as it is the most common proxy for the value.}. This is also a natural choice, assuming that, on average, the spread is symmetrical and centred around the true fundamental value \autocite[][1018]{leeMarketIntegrationPrice1993}. We multiply the so-obtained half-spread by $2$ to obtain the effective spread, which represents the cost for a round-trip trade involving a buy and sell ex commissions.
+
+Readily apparent from \cref{eq:effective-spread}, poor estimates for the predicted trade direction lead to an under- or overestimated effective spread, and hence to a skewed trade cost estimate. By comparing the true effective spread with the estimated one, we can derive the economic significance. For convenience, we also calculate the \emph{relative effective spread} as
+\begin{equation}
+ {PS}_{i,t} = S_{i,t} / V_{i,t}.
+\end{equation}
+% TODO check how it is defined Savickas / Finucane use midpoint, Peterson and Sirri divide by price?
+The subsequent section estimates both the nominal and relative effective spread for our test sets.
+
+\subsection{Simulation Results}\label{sec:simulation-results}
+
+The actual and the estimated effective spreads, as well as the quoted spread, are shown in \cref{tab:effective-spread}, aggregated by mean. \textcite[][896--897]{savickasInferringDirectionOption2003} estimated the effective spreads on a subset of rules for option trades at the \gls{CBOE}, which serve as a point of comparison.
+
+\begin{table}[H]
+ \centering
+ \input{Content/effective-spread.tex}
+ \caption[Estimated Effective Spread]{Estimated Effective Spread}
+ \label{tab:effective-spread}
+\end{table}
+
A $t$-test is used to test whether the estimated effective spread differs significantly from the mean true effective spread and whether it is significantly greater than zero at $p=0.01$ \autocite[cp.][570]{finucaneDirectTestMethods2000}. Alternatively, we compare correlations $\rho$ and medians using the Wilcoxon test with the null hypothesis of equal medians at $p=0.01$ \autocite[cp.][12]{theissenTestAccuracyLee2000}.
\newpage
\section{Discussion (3~p)}\label{sec:discussion}
diff --git a/reports/thesis.tex b/reports/thesis.tex
index 17148511..e326285a 100644
--- a/reports/thesis.tex
+++ b/reports/thesis.tex
@@ -134,23 +134,23 @@
% see https://tex.stackexchange.com/questions/347586/symbols-as-glossary-entry-indicators
%Symbols
\newglossaryentry{V}{type=symbols,name={\ensuremath{\cong[N_\t{V}]}},sort=V, description={vocabulary}}
-\newglossaryentry{a}{type=symbols,name={\ensuremath{a}},sort=a, description={sequence of ask prices}, unit={\ensuremath{=\langle a_{1},a_{2},\dots,a_{T}\rangle}}}
-\newglossaryentry{b}{type=symbols,name={\ensuremath{b}},sort=b, description={sequence of bid prices}, unit={\ensuremath{=\langle b_{1},b_{2},\dots,b_{T}\rangle}}}
-\newglossaryentry{a-tilde}{type=symbols,name={\ensuremath{\tilde{a}}},sort=a-tilde, description={sequence of ask sizes}, unit={\ensuremath{=\langle \tilde{a}_{1},\tilde{a}_{2},\dots,\tilde{a}_{T}\rangle}}}
-\newglossaryentry{b-tilde}{type=symbols,name={\ensuremath{\tilde{b}}},sort=b-tilde, description={sequence of bid sizes}, unit={\ensuremath{=\langle \tilde{b}_{1},\tilde{b}_{2},\dots,\tilde{b}_{T}\rangle}}}
+\newglossaryentry{A}{type=symbols,name={\ensuremath{A}},sort=A, description={sequence of ask prices}, unit={\ensuremath{=\langle A_{1},A_{2},\dots,A_{T}\rangle}}}
+\newglossaryentry{B}{type=symbols,name={\ensuremath{B}},sort=B, description={sequence of bid prices}, unit={\ensuremath{=\langle B_{1},B_{2},\dots,B_{T}\rangle}}}
+\newglossaryentry{A-tilde}{type=symbols,name={\ensuremath{\tilde{A}}},sort=A-tilde, description={sequence of ask sizes}, unit={\ensuremath{=\langle \tilde{A}_{1},\tilde{A}_{2},\dots,\tilde{A}_{T}\rangle}}}
+\newglossaryentry{B-tilde}{type=symbols,name={\ensuremath{\tilde{B}}},sort=B-tilde, description={sequence of bid sizes}, unit={\ensuremath{=\langle \tilde{B}_{1},\tilde{B}_{2},\dots,\tilde{B}_{T}\rangle}}}
\newglossaryentry{ell}{type=symbols,name={\ensuremath{d_e}}, sort=ell, description={length of token sequence}, unit={\ensuremath{\in \mathbb{N}}}}
\newglossaryentry{e}{type=symbols,name={\ensuremath{\boldsymbol{e}}}, sort=e, description={vector representation / embedding of a token}, unit={\ensuremath{\in \mathbb{R}^{d_e}}}}
\newglossaryentry{d}{type=symbols,name={\ensuremath{d}}, sort=d, description={dimension of a vector}, unit={\ensuremath{\in \mathbb{N}}}}
\newglossaryentry{ellmax}{type=symbols,name={\ensuremath{\ell_{\max}}},sort=ell-max, description={maximum sequence length}, unit={\ensuremath{\in \mathbb{N}}}}
\newglossaryentry{h}{type=symbols,name={\ensuremath{h}},sort=h, description={index of attention heads}}
\newglossaryentry{H}{type=symbols,name={\ensuremath{H}},sort=H, description={number of attention heads}}
-\newglossaryentry{p}{type=symbols,name={\ensuremath{p}},sort=p, description={sequence of trade prices}, unit={\ensuremath{=\langle p_{1},p_{2},\dots,p_{T}\rangle}}}
-\newglossaryentry{p-tilde}{type=symbols,name={\ensuremath{\tilde{p}}},sort=p-tilde, description={sequence of trade sizes}, unit={\ensuremath{=\langle \tilde{p}_{1},\tilde{p}_{2},\dots,\tilde{p}_{T}\rangle}}}
+\newglossaryentry{P}{type=symbols,name={\ensuremath{P}},sort=P, description={sequence of trade prices}, unit={\ensuremath{=\langle P_{1},P_{2},\dots,P_{T}\rangle}}}
+\newglossaryentry{P-tilde}{type=symbols,name={\ensuremath{\tilde{P}}},sort=P-tilde, description={sequence of trade sizes}, unit={\ensuremath{=\langle \tilde{P}_{1},\tilde{P}_{2},\dots,\tilde{P}_{T}\rangle}}}
\newglossaryentry{L}{type=symbols,name={\ensuremath{L}},sort=L, description={number of layers in the encoder / decoder}, unit={\ensuremath{\in \mathbb{N}}}}
-\newglossaryentry{m}{type=symbols,name={\ensuremath{m}},sort=m, description={sequence of spread midpoints}, unit={\ensuremath{\langle m_{1},m_{2},\dots,m_{T}\rangle}}}
+\newglossaryentry{M}{type=symbols,name={\ensuremath{M}},sort=M, description={sequence of spread midpoints}, unit={\ensuremath{\langle M_{1},M_{2},\dots,M_{T}\rangle}}}
\newglossaryentry{t}{type=symbols,name={\ensuremath{t}},sort=t, description={index of token in sequence},unit={\ensuremath{\in\left[\ell_{\max }\right]}}}
\newglossaryentry{x}{type=symbols,name={\ensuremath{x}},sort=x, description={primary token sequence}, unit={\ensuremath{\equiv x[1] x[2] \ldots x[\ell] \in V^{\ell}}}}
-\newglossaryentry{y}{type=symbols,name={\ensuremath{y}},sort=y, description={trade initiator / target}, unit={\ensuremath{\in \{0,1\}}}}
+\newglossaryentry{y}{type=symbols,name={\ensuremath{y}},sort=y, description={trade initiator / target}, unit={\ensuremath{\in \{-1,1\}}}}
\newglossaryentry{X}{type=symbols,name={\ensuremath{\boldsymbol{X}}},sort=X, description={encoded primary token sequence }, unit={\ensuremath{\in \mathbb{R}^{d_e \times \ell_x}}}}
\newglossarystyle{dotglos}{%
diff --git a/src/otc/metrics/__init__.py b/src/otc/metrics/__init__.py
new file mode 100644
index 00000000..979dacf3
--- /dev/null
+++ b/src/otc/metrics/__init__.py
@@ -0,0 +1,5 @@
+"""
+Support for metrics.
+
+See `readme.md` for instructions on how to run.
+"""
diff --git a/src/otc/metrics/metrics.py b/src/otc/metrics/metrics.py
new file mode 100644
index 00000000..e8effa94
--- /dev/null
+++ b/src/otc/metrics/metrics.py
@@ -0,0 +1,28 @@
+"""
+Sklearn implementation of effective spread.
+
+See: https://hagstromer.org/2020/11/23/overestimated-effective-spreads/ for explanation.
+"""
+import numpy as np
+import numpy.typing as npt
+from sklearn.utils import check_consistent_length
+
+
+def effective_spread(
+ y_pred: npt.NDArray, trade_price: npt.NDArray, fundamental_value: npt.NDArray
+) -> np.float64:
+ """
+ Calculate the effective spread given by:
+ $$
+ S_{i,t} = 2 (P_{i,t} - V_{i,t}) D_{i,t}
+ $$
+
+ Args:
+ y_pred (npt.NDArray): indicator if the trade is a buy or sell
+ trade_price (npt.NDArray): trade price
+ fundamental_value (npt.NDArray): fundamental value e. g., bid-ask midpoint.
+ Returns:
+ float: average effective spread
+ """
+ check_consistent_length(y_pred, trade_price, fundamental_value)
+ return np.mean(2 * (trade_price - fundamental_value) * y_pred)
diff --git a/src/otc/preprocessing/scaler.py b/src/otc/preprocessing/scaler.py
deleted file mode 100644
index 46d4a96b..00000000
--- a/src/otc/preprocessing/scaler.py
+++ /dev/null
@@ -1,67 +0,0 @@
-"""
-PyTorch implementation of z-standardization.
-
-See: https://en.wikipedia.org/w/index.php?title=Feature_scaling
-"""
-
-import torch
-
-
-class TorchStandardScaler:
- """
- Performs z-scaling.
-
- Fit on training set. Transorm training set, validation set, and test
- set with training set mean and std deviation.
- """
-
- def __init__(self) -> None:
- """
- z-scaler.
-
- See: https://en.wikipedia.org/w/index.php?title=Feature_scaling
- """
- self._mean: torch.Tensor
- self._std: torch.Tensor
- self._threshold = 1e-7
-
- def fit(self, x: torch.Tensor) -> None:
- """
- Calculate mean and std deviation of input tensor.
-
- Args:
- x (torch.Tensor): input tensor.
- """
- self._mean = x.mean(0, keepdim=True)
- self._std = x.std(0, unbiased=False, keepdim=True)
-
- def transform(self, x: torch.Tensor) -> torch.Tensor:
- """
- Apply z-scaling on input tensor.
-
- Args:
- x (torch.Tensor): input tensor.
-
- Returns:
- torch.Tensor: z-standardized tensor.
- """
- x -= self._mean
- # avoid division by zero through small const
- # scikit-learn does it differently by detecting near
- # constant features. See: https://bit.ly/3tYVWnW
- x /= self._std + self._threshold
- return x
-
- def inverse_transform(self, x: torch.Tensor) -> torch.Tensor:
- """
- Inverse z-scaling.
-
- Args:
- x (torch.Tensor): input tensor.
-
- Returns:
- torch.Tensor: unscaled output tensor.
- """
- x *= self._std + self._threshold
- x += self._mean
- return x
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
new file mode 100644
index 00000000..10c59b8a
--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,33 @@
+"""
+Tests for Metrics.
+"""
+
+import numpy as np
+
+from otc.metrics.metrics import effective_spread
+from otc.models.objective import set_seed
+
+
+class TestMetrics:
+ """
+ Perform automated tests for objectives.
+
+ Args:
+ metaclass (_type_, optional): parent. Defaults to abc.ABCMeta.
+ """
+
+ def test_effective_spread(self) -> None:
+ """
+ Test if effective spread returns a valid value.
+
+ Value may not be NaN.
+ """
+ set_seed(7)
+
+ y_pred = np.random.choice([-1, 1], size=(10))
+ trade_price = np.random.rand(10) * 100
+ fundamental_value = np.random.rand(10) * 100
+
+ e_s = effective_spread(y_pred, trade_price, fundamental_value)
+
+ assert np.isclose(e_s, 0.86, atol=1e-02, equal_nan=False)