diff --git a/references/obsidian/.obsidian/workspace.json b/references/obsidian/.obsidian/workspace.json
index 0a758fa7..17abfdb3 100644
--- a/references/obsidian/.obsidian/workspace.json
+++ b/references/obsidian/.obsidian/workspace.json
@@ -6,7 +6,7 @@
{
"id": "ac0c9ffcaad5ed9e",
"type": "tabs",
- "dimension": 59.76505139500734,
+ "dimension": 64.2504118616145,
"children": [
{
"id": "160a03ac0eb0e817",
@@ -23,13 +23,13 @@
}
},
{
- "id": "0f92442fd597a142",
+ "id": "5fc6bddb531253ce",
"type": "leaf",
"state": {
"type": "markdown",
"state": {
- "file": "chapters/🤖FTTransformer.md",
- "mode": "source",
+ "file": "chapters/🤖TabTransformer.md",
+ "mode": "preview",
"source": false
}
}
@@ -52,7 +52,7 @@
{
"id": "675af78723ee45d1",
"type": "tabs",
- "dimension": 40.23494860499266,
+ "dimension": 35.749588138385505,
"children": [
{
"id": "0abd77888f93b540",
@@ -78,7 +78,19 @@
"state": {
"type": "markdown",
"state": {
- "file": "chapters/🤖TabTransformer.md",
+ "file": "🧠Deep Learning Methods/Transformer/@huangTabTransformerTabularData2020.md",
+ "mode": "source",
+ "source": false
+ }
+ }
+ },
+ {
+ "id": "27942e948feec381",
+ "type": "leaf",
+ "state": {
+ "type": "markdown",
+ "state": {
+ "file": "chapters/🧵Positional encoding.md",
"mode": "source",
"source": false
}
@@ -102,13 +114,14 @@
"state": {
"type": "markdown",
"state": {
- "file": "🧠Deep Learning Methods/Transformer/@huangTabTransformerTabularData2020.md",
- "mode": "source",
+ "file": "chapters/🤖FTTransformer.md",
+ "mode": "preview",
"source": false
}
}
}
- ]
+ ],
+ "currentTab": 4
}
],
"direction": "vertical"
@@ -166,7 +179,7 @@
"state": {
"type": "backlink",
"state": {
- "file": "chapters/🤖FTTransformer.md",
+ "file": "chapters/🤖TabTransformer.md",
"collapseAll": false,
"extraContext": false,
"sortOrder": "alphabetical",
@@ -194,7 +207,7 @@
"state": {
"type": "outline",
"state": {
- "file": "chapters/🤖FTTransformer.md"
+ "file": "chapters/🤖TabTransformer.md"
}
}
}
@@ -215,17 +228,17 @@
"markdown-importer:Open format converter": false
}
},
- "active": "0f92442fd597a142",
+ "active": "5fc6bddb531253ce",
"lastOpenFiles": [
- "🧠Deep Learning Methods/@gorishniyRevisitingDeepLearning2021.md",
"chapters/🤖FTTransformer.md",
- "chapters/🤖Extensions to TabTransformer.md",
- "chapters/🤖Pretraining FTTransformer.md",
- "🧠Deep Learning Methods/Transformer/@gorishniyEmbeddingsNumericalFeatures2022.md",
- "TOC.md",
- "Semi-supervised Learning/@devlinBERTPretrainingDeep2019.md",
+ "chapters/🤖TabTransformer.md",
+ "🧠Deep Learning Methods/Transformer/@huangTabTransformerTabularData2020.md",
+ "chapters/🧵Positional encoding.md",
"chapters/🤖Training of the Transformer.md",
- "chapters/🛌Token Embedding.md",
- "chapters/🧭Attention map.md"
+ "chapters/💡Training and tuning.md",
+ "chapters/🤖Transformer.md",
+ "🎨transformer.canvas",
+ "chapters/🤖Pretraining FTTransformer.md",
+ "🧠Deep Learning Methods/@gorishniyRevisitingDeepLearning2021.md"
]
}
\ No newline at end of file
diff --git "a/references/obsidian/chapters/\360\237\222\241Training and tuning.md" "b/references/obsidian/chapters/\360\237\222\241Training and tuning.md"
index 1c7876a3..f45df079 100644
--- "a/references/obsidian/chapters/\360\237\222\241Training and tuning.md"
+++ "b/references/obsidian/chapters/\360\237\222\241Training and tuning.md"
@@ -1,6 +1,6 @@
+- training the Transformer has been found to be non-trivial [[@liuUnderstandingDifficultyTraining2020]]
- Do less alchemy and more understanding [Ali Rahimi's talk at NIPS(NIPS 2017 Test-of-time award presentation) - YouTube](https://www.youtube.com/watch?v=Qi1Yry33TQE)
- Keep algorithms / ideas simple. Add complexity only where needed!
-- Do rigorous testing.
- Don't chase the benchmark, but aim for explainability of the results.
- compare against https://github.com/jktis/Trade-Classification-Algorithms
- Classical rules could be implemented using https://github.com/jktis/Trade-Classification-Algorithms
diff --git "a/references/obsidian/chapters/\360\237\244\226FTTransformer.md" "b/references/obsidian/chapters/\360\237\244\226FTTransformer.md"
index 2e179cde..1b3f985d 100644
--- "a/references/obsidian/chapters/\360\237\244\226FTTransformer.md"
+++ "b/references/obsidian/chapters/\360\237\244\226FTTransformer.md"
@@ -5,7 +5,7 @@ The FTTransformer of [[@gorishniyRevisitingDeepLearning2021]] is an adaption of
The *feature tokenizer* transforms all features of $x$ to their embeddings. If the $j$-th feature, $x_j$, is **numerical**, it is projected to its embedding $e_j \in \mathbb{R}^{e_d}$ by element-wise multiplication with a learned vector $W_j \in \mathbb{R}^{e_d}$ and the addition of a feature-dependent bias term $b_j \in \mathbb{R}$, as in Equation (1).
-For **categorical** inputs, the embedding is implemented as a lookup table, similar to the techniques from Chapter [[🛌Token Embedding]] and [[🤖TabTransformer]]. We denote the cardinality of the $j$-th feature with $N_{C_j}$. The specific embeddings $e_j$ are queried with a unique integer key $c_j \in C_j \cong\left[N_{\mathrm{C_j}}\right]$ from the learned embedding matrix $W_j \in \mathbb{R}^{e_d \times N_{C_j}}$. Finally a feature-specific bias term $b_j$ is added (TODO: lookup if bias is a scalar or vector?). Similar to the [[🛌Token Embedding]], a previous label encoding (or a similar technique) must be employed, to map the categories to their unique integer keys. Overall:
+For **categorical** inputs, the embedding is implemented as a lookup table, similar to the techniques from Chapters [[🛌Token Embedding]] and [[🤖TabTransformer]]. We denote the cardinality of the $j$-th feature with $N_{C_j}$. The specific embeddings $e_j$ are queried with a unique integer key $c_j \in C_j \cong\left[N_{\mathrm{C_j}}\right]$ from the learned embedding matrix $W_j \in \mathbb{R}^{e_d \times N_{C_j}}$. Finally, a feature-specific bias term $b_j$ is added (TODO: lookup if bias is a scalar or vector?). Overall for $x_j$:
%%
Exemplary, the encoding of the option type could be $\text{P}\mapsto 1$; $\text{C}\mapsto 2$, which would result in a selection of the second column of the embedding matrix whenever a put is traded.
%%
diff --git "a/references/obsidian/chapters/\360\237\244\226TabTransformer.md" "b/references/obsidian/chapters/\360\237\244\226TabTransformer.md"
index e4aa5361..c63fda02 100644
--- "a/references/obsidian/chapters/\360\237\244\226TabTransformer.md"
+++ "b/references/obsidian/chapters/\360\237\244\226TabTransformer.md"
@@ -4,10 +4,22 @@
![[tab_transformer.png]]
(own drawing. Inspired by [[@huangTabTransformerTabularData2020]]. Top layers a little bit different. They write MLP. I take the FFN with two hidden layers and an output layer. Better change label to MLP; Also they call the input embedding a column embedding, use L instead of N) ^87bba0
-Motivated by the success of (cp. [[@devlinBERTPretrainingDeep2019]]; [[@liuRoBERTaRobustlyOptimized2019]]) of contextual embeddings in natural language processing, [[@huangTabTransformerTabularData2020]] propose with *TabTransformer* an adaption of the classical Transformer for the tabular domain. *TabTransformer* is *encoder-only* and features a stack of Transformer layers (see chapter [[🤖Transformer]] or [[@vaswaniAttentionAllYou2017]]) to learn contextualized embeddings of categorical features from their parametric embeddings, as shown in Figure ([[#^87bba0]]]). The transformer layers, are identical to those found in [[@vaswaniAttentionAllYou2017]] featuring multi-headed self-attention and a norm-last layer arrangement. Continuous inputs are normalized using layer norm ([[@baLayerNormalization2016]]) , concatenated with the contextual embeddings, and input into a multi-layer peceptron. More specifically, [[@huangTabTransformerTabularData2020]] (p. 4; 12) use a feed-forward network with two hidden layers, whilst other architectures and even non-deep models, such as [[🐈gradient-boosting]], are applicable. (downstream network?) Thus, for strictly continuous inputs, the network collapses to a multi-layer perceptron with layer normalization.
+Motivated by the success of contextual embeddings in natural language processing (cp. [[@devlinBERTPretrainingDeep2019]]; [[@liuRoBERTaRobustlyOptimized2019]]), [[@huangTabTransformerTabularData2020]] propose *TabTransformer*, an adaptation of the classical Transformer to the tabular domain. *TabTransformer* is *encoder-only* and features a stack of Transformer layers (see chapter [[🤖Transformer]] or [[@vaswaniAttentionAllYou2017]]) to learn contextualized embeddings of categorical features from their parametric embeddings, as shown in Figure ([[#^87bba0]]). The Transformer layers are identical to those of [[@vaswaniAttentionAllYou2017]], featuring multi-headed self-attention and a norm-last layer arrangement. Continuous inputs are normalized using layer norm ([[@baLayerNormalization2016]]), concatenated with the contextual embeddings, and fed into a multi-layer perceptron. More specifically, [[@huangTabTransformerTabularData2020]] (p. 4; 12) use a feed-forward network with two hidden layers, whilst other architectures and even non-deep models, such as [[🐈gradient-boosting]], are applicable. Thus, for strictly continuous inputs, the network collapses to a multi-layer perceptron with layer normalization.
-Due to the tabular nature of the data, with features arranged in a row-column fashion, the token embedding (see chapter [[🛌Token Embedding]]) is replaced for a *column embedding*. Also the notation needs to be adapted to the tabular domain. We denote the data set with $D:=\left\{\left(\mathbf{x}_k, y_k\right) \right\}_{k=1,\cdots N}$ identified with $\left[N_{\mathrm{D}}\right]:=\left\{1, \ldots, N_{\mathrm{D}}\right\}$. Each tuple $(\boldsymbol{x}, y)$ represents a row in the data set, and consist of the binary classification target $y_k \in \mathbb{R}$ and the vector of features $\boldsymbol{x} = \left\{\boldsymbol{x}_{\text{cat}}, \boldsymbol{x}_{\text{cont}}\right\}$, where $x_{\text{cont}} \in \mathbb{R}^c$ denotes all $c$ continuous features and $\boldsymbol{x}_{\text{cat}}\in \mathbb{R}^{m}$ all $m$ categorical features.
+Due to the tabular nature of the data, with features arranged in a row-column fashion, the token embedding (see chapter [[🛌Token Embedding]]) is replaced by a *column embedding*. Also, the notation needs to be adapted to the tabular domain. We denote the data set by $D:=\left\{\left(\boldsymbol{x}_k, y_k\right) \right\}_{k=1}^{N_{\mathrm{D}}}$, indexed by $\left[N_{\mathrm{D}}\right]:=\left\{1, \ldots, N_{\mathrm{D}}\right\}$. Each tuple $(\boldsymbol{x}, y)$ represents a row in the data set and consists of the binary classification target $y \in \mathbb{R}$ and the vector of features
+$\boldsymbol{x} = \left\{\boldsymbol{x}_{\text{cat}}, \boldsymbol{x}_{\text{cont}}\right\}$, where $\boldsymbol{x}_{\text{cont}} \in \mathbb{R}^c$ denotes all $c$ continuous features and $\boldsymbol{x}_{\text{cat}}\in \mathbb{R}^{m}$ all $m$ categorical features. We denote the cardinality of the $j$-th categorical feature, $j \in \left\{1, \cdots, m\right\}$, by $N_{C_j}$.
+In chapter [[🛌Token Embedding]], one lookup table suffices for storing the embeddings of all tokens within the sequence. Due to the heterogeneous nature of tabular data, every categorical column is independent of the $m-1$ other categorical columns. Thus, every column requires its own learned embedding matrix.
+The *feature-specific embeddings* are queried with a unique integer key $c_j \in C_j \cong\left[N_{\mathrm{C_j}}\right]$ from the learned embedding matrix $W_j \in \mathbb{R}^{e_d \times N_{C_j}}$ of the categorical column. Similar to the [[🛌Token Embedding]], a prior label encoding (or a similar technique) must be employed to map the categories to their unique integer keys.
+%%
+They use one additional class for NaN. This should already be addressed in pre-processing or by imputation, i.e., missing values become their own category. Thus, I think it's ok not to dwell on this here, as it is already part of $N_{C_j}$?
+%%
+Additionally, a *shared embedding* is learned. This embedding is identical for all categories of one feature and is added or concatenated to the feature-specific embeddings to enable the model to distinguish the classes of one column from those of other columns ([[@huangTabTransformerTabularData2020]] p. 10). For the variant where the shared embedding is added element-wise, the embedding matrix $W_S$ is of dimension $\mathbb{R}^{e_d \times m}$.
+
+Overall, the joint *column embedding* of $x_j$ is given by:
+$$
+e_j = W_j[:, c_j] + W_S[:, j].
+$$
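+
+A minimal PyTorch sketch of the addition variant (our naming; `cards` holds placeholder cardinalities $N_{C_j}$, `d_e` the embedding dimension; note that `nn.Embedding` stores the transpose of $W_j$, i.e., one row per category):
+
+```python
+import torch
+from torch import nn
+
+cards, d_e = [2, 5, 10], 8                               # placeholder cardinalities N_{C_j} and embedding dim e_d
+W = nn.ModuleList(nn.Embedding(c, d_e) for c in cards)   # one embedding matrix W_j per categorical column
+W_S = nn.Embedding(len(cards), d_e)                      # shared embedding W_S, one entry per column
+
+def column_embedding(x_cat: torch.Tensor) -> torch.Tensor:
+    """x_cat: (batch, m) integer keys c_j. Returns (batch, m, e_d) with e_j = W_j[:, c_j] + W_S[:, j]."""
+    e = torch.stack([W[j](x_cat[:, j]) for j in range(len(cards))], dim=1)
+    return e + W_S.weight                                 # W_S.weight has shape (m, e_d) and broadcasts over the batch
+
+column_embedding(torch.tensor([[1, 4, 9]]))               # -> shape (1, 3, 8)
+```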
%%
Notation adapted from [[@prokhorenkovaCatBoostUnbiasedBoosting2018]], [[@huangTabTransformerTabularData2020]]) and [[@phuongFormalAlgorithmsTransformers2022]]
Classification (ETransformer). Given a vocabulary $V$ and a set of classes $\left[N_{\mathrm{C}}\right]$, let $\left(x_n, c_n\right) \in$ $V^* \times\left[N_{\mathrm{C}}\right]$ for $n \in\left[N_{\text {data }}\right]$ be an i.i.d. dataset of sequence-class pairs sampled from $P(x, c)$. The goal in classification is to learn an estimate of the conditional distribution $P(c \mid x)$.
@@ -21,22 +33,7 @@ Assume we observe a dataset of examples $\mathcal{D}=\left\{\left(\mathbf{x}_k,
Let $(\boldsymbol{x}, y)$ denote a feature-target pair, where $\boldsymbol{x} \equiv$ $\left\{\boldsymbol{x}_{\text {cat }}, \boldsymbol{x}_{\text {cont }}\right\}$. The $\boldsymbol{x}_{\text {cat }}$ denotes all the categorical features and $x_{\text {cont }} \in \mathbb{R}^c$ denotes all of the $c$ continuous features. Let $\boldsymbol{x}_{\text {cat }} \equiv\left\{x_1, x_2, \cdots, x_m\right\}$ with each $x_i$ being a categorical feature, for $i \in\{1, \cdots, m\}$. (from [[@huangTabTransformerTabularData2020]] )
%%
-In chapter [[🛌Token Embedding]], one lookup table suffices for storing the embeddings of all tokens within the sequence. Due to the heterogeneous (?) nature of tabular data, every categorical column is independent of all $m-1$ other categorical columns. Thus, every column requires learning their own embedding matrix. As such, each column is embedded separately using a *column embedding*. For every $i$-th categorical column with $i \in {1,\cdots m}$ the
-
-TODO: Think about the projection / look up in code.
-
-%%
-
-![[column-embeddings.png]]
-
-The embedding matrix is now dependent on the on the ca table to retrieve the embedding vector $e \in \mathbb{R}^{d_{\mathrm{e}}}$ from a learned, embedding matrix $W_e \in \mathbb{R}^{d_{\mathrm{e}} \times N_{\mathrm{V}}}$ with a token-id $v \in {1,\cdots m}$ as shown :
-$$
-\tag{1}
-e=W_e[:, v].
-$$
-
-%%
-Note that categorical columns may be arranged in an arbitrary order and that the Transformer blocks are (... equivariant?), Thus, no [[🧵Positional encoding]] is required to inject the order. Analogous to chapter [[🤖Transformer]], the column embedding of each row is subsequently passed through several transformer layers, ultimately resulting in contextualized embeddings $\tilde{V} \in \mathbb{R}^{d_{\text {out}} \times m}$. At the end of the encoder, the contextual embeddings are flattened and concatenated with the continuous inputs into a ($d_\text{dim} \times m + c$)-dimensional vector, which serves as input to the multi-layer perceptron ([[@huangTabTransformerTabularData2020]] (p. 3)). Like before, a linear layer and softmax activation (actually it's just a sigmoid due to the binary case, which is less computationally demanding for the binary case) are used to retrieve the class probabilities.
+Note that the categorical columns may be arranged in an arbitrary order and that the Transformer blocks are permutation-equivariant; thus, no [[🧵Positional encoding]] is required to inject the order. Analogous to chapter [[🤖Transformer]], the embeddings of each row, $X = [e_1, \cdots, e_m]$, are subsequently passed through several Transformer layers, ultimately resulting in contextualized embeddings. At the end of the encoder, the contextual embeddings are flattened and concatenated with the continuous inputs into a $(e_d \cdot m + c)$-dimensional vector, which serves as input to the multi-layer perceptron ([[@huangTabTransformerTabularData2020]] p. 3). Like before, a linear layer and a softmax activation (or, in the binary case, simply a sigmoid, which is computationally cheaper) are used to retrieve the class probabilities.
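+
+The forward pass described above can be summarized in a minimal PyTorch sketch, assuming the addition variant of the column embedding and a two-hidden-layer MLP; layer sizes and hyperparameters are placeholders, not the configuration of [[@huangTabTransformerTabularData2020]]:
+
+```python
+import torch
+from torch import nn
+
+class TabTransformerSketch(nn.Module):
+    def __init__(self, cards: list[int], n_cont: int, d_e: int = 32, n_layers: int = 6, n_heads: int = 8):
+        super().__init__()
+        # column embedding (addition variant): per-column lookup W_j plus shared embedding W_S
+        self.per_column = nn.ModuleList(nn.Embedding(c, d_e) for c in cards)
+        self.shared = nn.Embedding(len(cards), d_e)
+        # norm-last (post-LN) encoder layers, as in the classical Transformer
+        layer = nn.TransformerEncoderLayer(d_model=d_e, nhead=n_heads, batch_first=True, norm_first=False)
+        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
+        self.cont_norm = nn.LayerNorm(n_cont)                       # layer norm on the continuous inputs
+        d_in = d_e * len(cards) + n_cont                            # flattened contextual embeddings + continuous
+        self.mlp = nn.Sequential(nn.Linear(d_in, 4 * d_in), nn.ReLU(),
+                                 nn.Linear(4 * d_in, 2 * d_in), nn.ReLU(),
+                                 nn.Linear(2 * d_in, 1))            # single logit for the binary target
+
+    def forward(self, x_cat: torch.Tensor, x_cont: torch.Tensor) -> torch.Tensor:
+        cols = torch.arange(x_cat.shape[1], device=x_cat.device)
+        e = torch.stack([emb(x_cat[:, j]) for j, emb in enumerate(self.per_column)], dim=1)
+        h = self.encoder(e + self.shared(cols))                     # contextual embeddings (batch, m, d_e)
+        h = torch.cat([h.flatten(1), self.cont_norm(x_cont)], dim=1)
+        return torch.sigmoid(self.mlp(h)).squeeze(-1)               # class probability
+```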
In large-scale experiments, [[@huangTabTransformerTabularData2020]] (p. 5 f.) show that the use of contextual embeddings improves the model's robustness to both noise and missing data. For various binary classification tasks, the TabTransformer outperforms other deep learning models, e.g., vanilla multi-layer perceptrons, in terms of *area under the curve* (AUC), and can compete with [[🐈gradient-boosting]].
diff --git "a/references/obsidian/chapters/\360\237\244\226Training of the Transformer.md" "b/references/obsidian/chapters/\360\237\244\226Training of the Transformer.md"
index bc23d1a5..03f2f5b0 100644
--- "a/references/obsidian/chapters/\360\237\244\226Training of the Transformer.md"
+++ "b/references/obsidian/chapters/\360\237\244\226Training of the Transformer.md"
@@ -1,5 +1,7 @@
#lr-warmup #lr-scheduling
+
+- training the Transformer has been found to be non-trivial [[@liuUnderstandingDifficultyTraining2020]]
- introduce notion of effective batch size (batch size when training is split across multiple gpus; see [[🧠Deep Learning Methods/Transformer/@popelTrainingTipsTransformer2018]] p. 46)
- report or store training times?
- In case of diverged training, try gradient clipping and/or more warmup steps. (found in [[🧠Deep Learning Methods/Transformer/@popelTrainingTipsTransformer2018]])
@@ -8,6 +10,9 @@
- One might have to adjust the lr when scaling across multiple gpus; [[@poppeSensitivityVPINChoice2016]] contains a nice discussion.
- Use weight decay of 0.1 for a small amount of regularization [[@loshchilovDecoupledWeightDecay2019]].
+- On the choice of activation function, see [[@shazeerGLUVariantsImprove2020]]
+
+
- log gradients and loss using `wandb.watch` as shown here https://www.youtube.com/watch?v=k6p-gqxJfP4 with `wandb.log({"epoch":epoch, "loss":loss}, step)` (nested in `if ((batch_ct +1) % 25) == 0:`) and `wandb.watch(model, criterion, log="all", log_freq=10)`
- watch out for exploding and vanishing gradients
- distillation, learning rate warmup, learning rate decay (not used but could improve training times and maybe accuracy) ([[@gorishniyRevisitingDeepLearning2021]])
diff --git "a/references/obsidian/chapters/\360\237\244\226transformer.md" "b/references/obsidian/chapters/\360\237\244\226transformer.md"
index e7a3b93c..6938a3a7 100644
--- "a/references/obsidian/chapters/\360\237\244\226transformer.md"
+++ "b/references/obsidian/chapters/\360\237\244\226transformer.md"
@@ -2,6 +2,8 @@
![[classical_transformer_architecture.png]]
(own drawing after [[@daiTransformerXLAttentiveLanguage2019]], use L instead of N)
+![[Pasted image 20230115060830.png]]
+
In the subsequent sections, we introduce the classical Transformer of [[@vaswaniAttentionAllYou2017]]. We focus on its central building blocks, such as self-attention and multi-headed attention. We then transfer the concepts to the tabular domain by covering [[🤖TabTransformer]] and [[🤖FTTransformer]]. Throughout the work, we adhere to the notation suggested in [[@phuongFormalAlgorithmsTransformers2022]].
- encoder/ decoder models $\approx$ sequence-to-sequence model
@@ -35,7 +37,7 @@ Open:
- [ ] Residual connections
- [ ] Layer Norm, Pre-Norm, and Post-Norm
- [x] TabTransformer
-- [ ] FTTransformer
+- [x] FTTransformer
- [ ] Pre-Training
- [ ] Embeddings of categorical / continuous data
- [ ] Selection of supervised approaches
@@ -106,8 +108,8 @@ feature importance evaluation is a non-trivial problem due to missing ground tru
- intuition behind multi-head and self-attention e. g. cosine similarity, key and querying mechanism: https://www.youtube.com/watch?v=mMa2PmYJlCo&list=PL86uXYUJ7999zE8u2-97i4KG_2Zpufkfb
-
-
+- "Our analysis starts from the observation: the original Transformer (referred to as Post-LN) is less robust than its Pre-LN variant (Baevski and Auli, 2019; Xiong et al., 2019; Nguyen and Salazar, 2019)." (from [[@liuUnderstandingDifficultyTraining2020]])
+- motivation to switch to Pre-LN
- General Introduction: [[@vaswaniAttentionAllYou2017]]
- What is Attention?
diff --git "a/references/obsidian/\360\237\247\240Deep Learning Methods/Transformer/@huangTabTransformerTabularData2020.md" "b/references/obsidian/\360\237\247\240Deep Learning Methods/Transformer/@huangTabTransformerTabularData2020.md"
index e8488fdf..57004227 100644
--- "a/references/obsidian/\360\237\247\240Deep Learning Methods/Transformer/@huangTabTransformerTabularData2020.md"
+++ "b/references/obsidian/\360\237\247\240Deep Learning Methods/Transformer/@huangTabTransformerTabularData2020.md"
@@ -71,6 +71,9 @@ $$
$$
Below, we explain the Transformer layers and column embedding.
+Important details on column embedding:
+Column Embedding. The first study is on the choice of column embedding - shared parameters $\boldsymbol{c}_{\phi_i}$ across the embeddings of multiple classes in column $i$ for $i \in \{1,2, \ldots, m\}$. In particular, we study the optimal dimension of $\boldsymbol{c}_{\phi_i}$, $\ell$. An alternative choice is to element-wisely add the unique identifier $\boldsymbol{c}_{\phi_i}$ and feature-value specific embeddings $\boldsymbol{w}_{\phi_{i j}}$ rather than concatenating them. In that case, both the dimension of $\boldsymbol{c}_{\phi_i}$ and $\boldsymbol{w}_{\phi_{i j}}$ are equal to the dimension of embedding $d$. The goal of having column embedding is to
+
## Comparison FT-Transformer and TabTransformer
![[ft-tab-transformer.png]]
@@ -82,6 +85,20 @@ Below, we explain the Transformer layers and column embedding.
- In addition to the purely supervised regime, the authors propose a semi-supervised approach leveraging unsupervised pre-training.
- Looking at the average AUC across 15 datasets, the proposed TabTransformer (82.8) is on par with gradient-boosted trees (82.9).
+
+## Notes from the keras blog
+https://keras.io/examples/structured_data/tabtransformer/#experiment-2-tabtransformer
+
+1. All the categorical features are encoded as embeddings, using the same `embedding_dims`. This means that each value in each categorical feature will have its own embedding vector.
+2. A column embedding, one embedding vector for each categorical feature, is added (point-wise) to the categorical feature embedding.
+3. The embedded categorical features are fed into a stack of Transformer blocks. Each Transformer block consists of a multi-head self-attention layer followed by a feed-forward layer.
+4. The outputs of the final Transformer layer, which are the _contextual embeddings_ of the categorical features, are concatenated with the input numerical features, and fed into a final MLP block.
+5. A `softmax` classifier is applied at the end of the model.
+
+The [paper](https://arxiv.org/abs/2012.06678) discusses both addition and concatenation of the column embedding in the _Appendix: Experiment and Model Details_ section. The architecture of TabTransformer is shown below, as presented in the paper.
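+
+A minimal sketch of the concatenation variant discussed there, assuming a learned column identifier of dimension `l` concatenated with a feature-value embedding of dimension `d - l` (names are ours, not from the paper or the blog):
+
+```python
+import torch
+from torch import nn
+
+def column_embed_concat(x_cat: torch.Tensor, value_embs: nn.ModuleList, col_ids: torch.Tensor) -> torch.Tensor:
+    """x_cat: (batch, m) keys; value_embs: one nn.Embedding of dim d - l per column;
+    col_ids: (m, l) learned column identifiers c_phi_i. Returns (batch, m, d)."""
+    vals = torch.stack([emb(x_cat[:, j]) for j, emb in enumerate(value_embs)], dim=1)   # (batch, m, d - l)
+    ids = col_ids.unsqueeze(0).expand(x_cat.shape[0], -1, -1)                           # (batch, m, l)
+    return torch.cat([ids, vals], dim=-1)
+```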
+
+
+
## Notes from Talk by Zohar Karnin
(see here: https://www.youtube.com/watch?v=-ZdHhyQsvRc)
diff --git a/reports/Content/bibliography.bib b/reports/Content/bibliography.bib
index 6db1c026..758d8d2a 100644
--- a/reports/Content/bibliography.bib
+++ b/reports/Content/bibliography.bib
@@ -1,10 +1,3 @@
-@misc{262588213843476PythonPseudocodeConverter,
- title = {Python to {{Pseudocode}} Converter},
- author = {262588213843476},
- journal = {Gist},
- howpublished = {https://gist.github.com/BlueNexus/599962d03a1b52a8d5f595dabd51dc34}
-}
-
@incollection{abeDeepLearningForecasting2018,
title = {Deep Learning for Forecasting Stock Returns in the Cross-Section},
booktitle = {Advances in {{Knowledge Discovery}} and {{Data Mining}}},
@@ -209,17 +202,6 @@ @article{baLayerNormalization2016
archiveprefix = {arXiv}
}
-@misc{baLayerNormalization2016a,
- title = {Layer {{Normalization}}},
- author = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E.},
- year = {2016},
- number = {arXiv:1607.06450},
- eprint = {1607.06450},
- eprinttype = {arxiv},
- publisher = {{arXiv}},
- archiveprefix = {arXiv}
-}
-
@book{banachewiczKaggleBookData2022,
title = {The {{Kaggle Book}}: {{Data Analysis}} and {{Machine Learning}} for {{Competitive Data Science}}},
author = {Banachewicz, Konrad and Massaron, Luca},
@@ -906,18 +888,6 @@ @article{dieboldComparingPredictiveAccuracy1995
doi = {10.1080/07350015.1995.10524599}
}
-@misc{DiscordNewWay,
- title = {{Discord - A New Way to Chat with Friends \& Communities}},
- journal = {Discord},
- howpublished = {https://discord.com/login?redirect\_to=\%2Fchannels\%2F595999872222756885\%2F712994588318761020}
-}
-
-@misc{DistillLatestArticles,
- title = {Distill \textemdash{} {{Latest}} Articles about Machine Learning},
- journal = {Distill},
- howpublished = {http://distill.pub/}
-}
-
@article{Easley_1987,
title = {{{PRICE}}, {{TRADE SIZE}}, {{AND INFORMATION IN SECURITIES MARKETS}}*},
author = {Easley, David and O'Hara, Maureen},
@@ -951,6 +921,17 @@ @article{Easley_1996
pmid = {null}
}
+@article{Easley_1998,
+ title = {Option Volume and Stock Prices: {{Evidence}} on Where Informed Traders Trade},
+ author = {Easley, David and O'Hara, Maureen and Srinivas, P.S.},
+ year = {1998},
+ journal = {null},
+ doi = {null},
+ mag_id = {3122983259},
+ pmcid = {null},
+ pmid = {null}
+}
+
@article{Easley_2002,
title = {Is Information Risk a Determinant of Asset Returns},
author = {Easley, David and Hvidkjaer, Soeren and O'Hara, Maureen},
@@ -1076,13 +1057,6 @@ @article{fawziDiscoveringFasterMatrix2022
doi = {10.1038/s41586-022-05172-4}
}
-@misc{FeatureEngineeringHandling00:00:00-05:00,
- title = {Feature Engineering - Handling Cyclical Features},
- year = {00:00:00-05:00},
- journal = {From Neutrinos to Data Science},
- howpublished = {http://blog.davidkaleko.com/feature-engineering-cyclical-features.html}
-}
-
@article{fedeniaMachineLearningCorporate2021,
title = {Machine {{Learning}} in the {{Corporate Bond Market}} and {{Beyond}}: {{A New Classifier}}},
author = {Fedenia, Mark A. and Nam, Seunghan and Ronen, Tavy},
@@ -1148,10 +1122,6 @@ @article{finucaneDirectTestMethods2000
doi = {10.2307/2676255}
}
-@misc{FiveYearsNeural,
- title = {Five {{Years With Neural Networks}} and a {{Research Retreat}} \textendash{} {{Timo Denk}}'s {{Blog}}}
-}
-
@article{Fleming_1996,
title = {Trading Costs and the Relative Rates of Price Discovery in Stock, Futures, and Option Markets},
author = {Fleming, Jeff and Ostdiek, Barbara and Whaley, Robert E.},
@@ -1369,17 +1339,6 @@ @misc{GradientBoostPart
howpublished = {https://www.youtube.com/watch?v=3CC4N4z3GJc}
}
-@article{Grammig_2001,
- title = {Knowing Me, Knowing You: {{Trader}} Anonymity and Informed Trading in Parallel Markets *},
- author = {Grammig, Joachim and Schiereck, Dirk and Theissen, Erik},
- year = {2001},
- journal = {Journal of Financial Markets},
- doi = {10.1016/s1386-4181(01)00018-0},
- mag_id = {1540952371},
- pmcid = {null},
- pmid = {null}
-}
-
@article{grammigDivergingRoadsTheoryBased2020,
title = {Diverging Roads: Theory-Based vs. Machine Learning-Implied Stock Risk Premia},
author = {Grammig, Joachim and Hanenberg, Constantin and Schlag, Christian and S{\"o}nksen, Jantje},
@@ -1548,17 +1507,6 @@ @article{hasbrouckTradesQuotesInventories1988
doi = {10.1016/0304-405X(88)90070-0}
}
-@article{Hastie_2001,
- title = {The Elements of Statistical Learning},
- author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome H. and Friedman, Jerome H.},
- year = {2001},
- journal = {null},
- doi = {null},
- mag_id = {1480376833},
- pmcid = {null},
- pmid = {null}
-}
-
@book{hastietrevorElementsStatisticalLearning2009,
title = {The {{Elements}} of {{Statistical Learning}}},
author = {Hastie, Trevor, Sami and Friedman, Harry and Tibshirani, Robert},
@@ -1843,6 +1791,17 @@ @inproceedings{jinDatadrivenApproachPredict2015
doi = {10.1109/CSNT.2015.25}
}
+@misc{jinPruningEffectGeneralization2022,
+ title = {Pruning's {{Effect}} on {{Generalization Through}} the {{Lens}} of {{Training}} and {{Regularization}}},
+ author = {Jin, Tian and Carbin, Michael and Roy, Daniel M. and Frankle, Jonathan and Dziugaite, Gintare Karolina},
+ year = {2022},
+ number = {arXiv:2210.13738},
+ eprint = {2210.13738},
+ eprinttype = {arxiv},
+ publisher = {{arXiv}},
+ archiveprefix = {arXiv}
+}
+
@article{johnsonSurveyDeepLearning2019,
title = {Survey on Deep Learning with Class Imbalance},
author = {Johnson, Justin M. and Khoshgoftaar, Taghi M.},
@@ -2681,6 +2640,17 @@ @misc{ntakourisTimeSeriesTransformer2021
howpublished = {https://towardsdatascience.com/the-time-series-transformer-2a521a0efad3}
}
+@article{O'Hara_2001,
+ title = {The Accuracy of Trade Classification Rules: {{Evidence}} from {{NASDAQ}}},
+ author = {O'Hara, Maureen and Ellis, Katrina and Michaely, Roni},
+ year = {2001},
+ journal = {null},
+ doi = {null},
+ mag_id = {3121731372},
+ pmcid = {null},
+ pmid = {null}
+}
+
@inproceedings{obthongSurveyMachineLearning2020,
title = {A {{Survey}} on {{Machine Learning}} for {{Stock Price Prediction}}: {{Algorithms}} and {{Techniques}}:},
booktitle = {Proceedings of the 2nd {{International Conference}} on {{Finance}}, {{Economics}}, {{Management}} and {{IT Business}}},
@@ -3036,16 +3006,6 @@ @article{Rosenthal_2008
pmid = {null}
}
-@article{rosenthalModelingTradeDirection2012,
- title = {Modeling {{Trade Direction}}},
- author = {Rosenthal, D. W. R.},
- year = {2012},
- journal = {Journal of Financial Econometrics},
- volume = {10},
- number = {2},
- doi = {10.1093/jjfinec/nbr014}
-}
-
@article{rossiMachineLearning,
title = {Predicting Stock Market Returns with Machine Learning},
author = {Rossi, Alberto},
@@ -3653,17 +3613,6 @@ @book{wittenDataMiningPractical2017
address = {{Amsterdam}}
}
-@misc{xieAdanAdaptiveNesterov2022,
- title = {Adan: {{Adaptive Nesterov Momentum Algorithm}} for {{Faster Optimizing Deep Models}}},
- author = {Xie, Xingyu and Zhou, Pan and Li, Huan and Lin, Zhouchen and Yan, Shuicheng},
- year = {2022},
- number = {arXiv:2208.06677},
- eprint = {2208.06677},
- eprinttype = {arxiv},
- publisher = {{arXiv}},
- archiveprefix = {arXiv}
-}
-
@misc{xiongLayerNormalizationTransformer2020,
title = {On {{Layer Normalization}} in the {{Transformer Architecture}}},
author = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tie-Yan},
@@ -3675,17 +3624,6 @@ @misc{xiongLayerNormalizationTransformer2020
archiveprefix = {arXiv}
}
-@misc{xiongLayerNormalizationTransformer2020a,
- title = {On {{Layer Normalization}} in the {{Transformer Architecture}}},
- author = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tie-Yan},
- year = {2020},
- number = {arXiv:2002.04745},
- eprint = {2002.04745},
- eprinttype = {arxiv},
- publisher = {{arXiv}},
- archiveprefix = {arXiv}
-}
-
@article{yangStockPricePrediction2021,
title = {Stock {{Price Prediction Based}} on {{XGBoost}} and {{LightGBM}}},
author = {Yang, Yue and Wu, Yang and Wang, Peikun and Jiali, Xu},