Commit fa68f17
added more content about lemmatization and gpt
Kubus42 committed Apr 6, 2024
1 parent a73578b commit fa68f17
Showing 47 changed files with 847 additions and 1,550 deletions.
2 changes: 1 addition & 1 deletion _freeze/embeddings/clustering/execute-results/html.json
@@ -2,7 +2,7 @@
"hash": "316140ef068cc0a03b26ac3926222783",
"result": {
"engine": "jupyter",
"markdown": "---\ntitle: Clustering\nformat:\n html:\n code-fold: false\n---\n\n::: {#21e25566 .cell execution_count=1}\n``` {.python .cell-code}\n# prerequisites\n\nimport os\nfrom llm_utils.client import get_openai_client, OpenAIModels\n\nMODEL = OpenAIModels.EMBED.value\n\n# get the OpenAI client\nclient = get_openai_client(\n model=MODEL,\n config_path=os.environ.get(\"CONFIG_PATH\")\n)\n```\n:::\n\n\n::: {#c1e9c4ab .cell execution_count=2}\n``` {.python .cell-code}\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n```\n:::\n\n\n::: {#0adb1203 .cell execution_count=3}\n``` {.python .cell-code}\n# Define a list of words to cluster\nwords = [\"king\", \"queen\", \"man\", \"woman\", \"apple\", \"banana\", \"grapes\", \"cat\", \"dog\", \"happy\", \"sad\"]\n\n# Get embeddings for the words\nresponse = client.embeddings.create(\n input=words,\n model=MODEL\n)\n\nembeddings = [emb.embedding for emb in response.data]\n```\n:::\n\n\n::: {#518d9c00 .cell execution_count=4}\n``` {.python .cell-code}\n# do the clustering\nimport numpy as np\nfrom sklearn.cluster import KMeans\n\nn_clusters = 5\n\n# define the model\nkmeans = KMeans(\n n_clusters=n_clusters,\n n_init=\"auto\",\n random_state=2 # do this to get the same output\n)\n\n# fit the model to the data\nkmeans.fit(np.array(embeddings))\n\n# get the cluster labels\ncluster_labels = kmeans.labels_\n```\n:::\n\n\n::: {#cell-tsne-visualization of clustering .cell execution_count=5}\n``` {.python .cell-code}\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n\n# Apply t-SNE dimensionality reduction\ntsne = TSNE(\n n_components=2, \n random_state=42,\n perplexity=5 # see documentation to set this correctly\n)\nembeddings_2d = tsne.fit_transform(np.array(embeddings))\n\n# Define a color map for clusters\ncolors = plt.cm.viridis(np.linspace(0, 1, n_clusters))\n\n# Plot the embeddings in a two-dimensional scatter plot\nplt.figure(figsize=(10, 8))\nfor i, word in enumerate(words):\n x, y = embeddings_2d[i]\n cluster_label = cluster_labels[i]\n color = colors[cluster_label]\n plt.scatter(x, y, marker='o', color=color)\n plt.text(x, y, word, fontsize=9)\n\nplt.xlabel(\"t-SNE dimension 1\")\nplt.ylabel(\"t-SNE dimension 2\")\nplt.grid(True)\nplt.xticks([])\nplt.yticks([])\nplt.show()\n```\n\n::: {.cell-output .cell-output-display}\n![t-SNE visualization of clustering word embeddings](clustering_files/figure-html/tsne-visualization-of-clustering-output-1.png){#tsne-visualization-of-clustering width=789 height=629}\n:::\n:::\n\n\n",
"markdown": "---\ntitle: Clustering\nformat:\n html:\n code-fold: false\n---\n\n::: {#9ca983c1 .cell execution_count=1}\n``` {.python .cell-code}\n# prerequisites\n\nimport os\nfrom llm_utils.client import get_openai_client, OpenAIModels\n\nMODEL = OpenAIModels.EMBED.value\n\n# get the OpenAI client\nclient = get_openai_client(\n model=MODEL,\n config_path=os.environ.get(\"CONFIG_PATH\")\n)\n```\n:::\n\n\n::: {#2f825ad9 .cell execution_count=2}\n``` {.python .cell-code}\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n```\n:::\n\n\n::: {#74f945e4 .cell execution_count=3}\n``` {.python .cell-code}\n# Define a list of words to cluster\nwords = [\"king\", \"queen\", \"man\", \"woman\", \"apple\", \"banana\", \"grapes\", \"cat\", \"dog\", \"happy\", \"sad\"]\n\n# Get embeddings for the words\nresponse = client.embeddings.create(\n input=words,\n model=MODEL\n)\n\nembeddings = [emb.embedding for emb in response.data]\n```\n:::\n\n\n::: {#2f2d689d .cell execution_count=4}\n``` {.python .cell-code}\n# do the clustering\nimport numpy as np\nfrom sklearn.cluster import KMeans\n\nn_clusters = 5\n\n# define the model\nkmeans = KMeans(\n n_clusters=n_clusters,\n n_init=\"auto\",\n random_state=2 # do this to get the same output\n)\n\n# fit the model to the data\nkmeans.fit(np.array(embeddings))\n\n# get the cluster labels\ncluster_labels = kmeans.labels_\n```\n:::\n\n\n::: {#cell-tsne-visualization of clustering .cell execution_count=5}\n``` {.python .cell-code}\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n\n# Apply t-SNE dimensionality reduction\ntsne = TSNE(\n n_components=2, \n random_state=42,\n perplexity=5 # see documentation to set this correctly\n)\nembeddings_2d = tsne.fit_transform(np.array(embeddings))\n\n# Define a color map for clusters\ncolors = plt.cm.viridis(np.linspace(0, 1, n_clusters))\n\n# Plot the embeddings in a two-dimensional scatter plot\nplt.figure(figsize=(10, 8))\nfor i, word in enumerate(words):\n x, y = embeddings_2d[i]\n cluster_label = cluster_labels[i]\n color = colors[cluster_label]\n plt.scatter(x, y, marker='o', color=color)\n plt.text(x, y, word, fontsize=9)\n\nplt.xlabel(\"t-SNE dimension 1\")\nplt.ylabel(\"t-SNE dimension 2\")\nplt.grid(True)\nplt.xticks([])\nplt.yticks([])\nplt.show()\n```\n\n::: {.cell-output .cell-output-display}\n![t-SNE visualization of clustering word embeddings](clustering_files/figure-html/tsne-visualization-of-clustering-output-1.png){#tsne-visualization-of-clustering width=789 height=629}\n:::\n:::\n\n\n",
"supporting": [
"clustering_files"
],
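The clustering notebook above fits KMeans on the word embeddings and colors the t-SNE plot by cluster, but never prints which words ended up together. Below is a minimal sketch of that inspection step, using random stand-in vectors rather than real API embeddings (an assumption, since running the original cells requires the OpenAI client from `llm_utils`):

```python
# Minimal sketch of the diffed clustering step, with random placeholder vectors
# standing in for the OpenAI embeddings (assumption: the notebook obtains them
# via client.embeddings.create). Shows how KMeans labels map back to the words.
import numpy as np
from sklearn.cluster import KMeans

words = ["king", "queen", "man", "woman", "apple", "banana",
         "grapes", "cat", "dog", "happy", "sad"]
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(len(words), 16))  # placeholder vectors

kmeans = KMeans(n_clusters=5, n_init="auto", random_state=2)
cluster_labels = kmeans.fit_predict(embeddings)

# group the words by their assigned cluster
clusters: dict[int, list[str]] = {}
for word, label in zip(words, cluster_labels):
    clusters.setdefault(int(label), []).append(word)

for label, members in sorted(clusters.items()):
    print(label, members)
```

With real embeddings in place of the random vectors, the printed groups should roughly match the colored clusters in the t-SNE figure.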
4 changes: 2 additions & 2 deletions _freeze/embeddings/embeddings/execute-results/html.json
@@ -1,8 +1,8 @@
{
"hash": "dab3ad44f7c21587b0f89e7e66481228",
"hash": "207e9945de189fc7852f62582cc9a36b",
"result": {
"engine": "jupyter",
"markdown": "---\ntitle: Embeddings\nformat:\n html:\n code-fold: true\n---\n\n- **Word Embeddings**: Word embeddings are dense vector representations of words in a continuous vector space. Each word is mapped to a high-dimensional vector where words with similar meanings or contexts are closer together in the vector space.\n- **Contextual Embeddings**: Contextual embeddings, also known as contextualized word embeddings, capture the contextual information of words based on their surrounding context in a given sentence or document. Unlike traditional word embeddings, contextual embeddings vary depending on the context in which a word appears.\n- **BERT Embeddings**: BERT (Bidirectional Encoder Representations from Transformers) embeddings are contextual embeddings provided by OpenAI. They are generated by pre-training a Transformer-based neural network model on a large corpus of text data using masked language modeling and next sentence prediction tasks.\n- **GPT Embeddings**: GPT (Generative Pre-trained Transformer) embeddings are also contextual embeddings provided by OpenAI. They are generated by pre-training a Transformer-based neural network model on a large corpus of text data using an autoregressive language modeling objective.\n- **Transformer-based Architecture**: Both BERT and GPT embeddings are derived from Transformer-based architectures, which consist of multiple layers of self-attention mechanisms and feed-forward neural networks. These architectures excel at capturing long-range dependencies and contextual information in sequential data.\n- **Pre-trained Models**: OpenAI provides pre-trained BERT and GPT models that have been trained on large-scale text corpora. These pre-trained models can be fine-tuned on specific tasks or domains with labeled data to adapt their knowledge and capabilities to new applications.\n- **Transfer Learning**: BERT and GPT embeddings support transfer learning, where the pre-trained models are used as feature extractors for downstream NLP tasks. By fine-tuning these models on task-specific data, users can leverage the knowledge encoded in the embeddings to achieve state-of-the-art performance on various natural language processing tasks.\n- **Applications**: BERT and GPT embeddings have a wide range of applications in natural language processing tasks such as text classification, named entity recognition, sentiment analysis, question-answering, and more. They provide powerful representations of text data that capture both semantic and syntactic information.\n\n",
"markdown": "---\ntitle: Embeddings\nformat:\n html:\n code-fold: true\n---\n\n## What are embeddings? \n\n- **Word Embeddings**: Word embeddings are dense vector representations of words in a continuous vector space. Each word is mapped to a high-dimensional vector where words with similar meanings or contexts are closer together in the vector space.\n- **Contextual Embeddings**: Contextual embeddings, also known as contextualized word embeddings, capture the contextual information of words based on their surrounding context in a given sentence or document. Unlike traditional word embeddings, contextual embeddings vary depending on the context in which a word appears.\n- **BERT Embeddings**: BERT (Bidirectional Encoder Representations from Transformers) embeddings are contextual embeddings provided by OpenAI. They are generated by pre-training a Transformer-based neural network model on a large corpus of text data using masked language modeling and next sentence prediction tasks.\n- **GPT Embeddings**: GPT (Generative Pre-trained Transformer) embeddings are also contextual embeddings provided by OpenAI. They are generated by pre-training a Transformer-based neural network model on a large corpus of text data using an autoregressive language modeling objective.\n- **Transformer-based Architecture**: Both BERT and GPT embeddings are derived from Transformer-based architectures, which consist of multiple layers of self-attention mechanisms and feed-forward neural networks. These architectures excel at capturing long-range dependencies and contextual information in sequential data.\n- **Pre-trained Models**: OpenAI provides pre-trained BERT and GPT models that have been trained on large-scale text corpora. These pre-trained models can be fine-tuned on specific tasks or domains with labeled data to adapt their knowledge and capabilities to new applications.\n- **Transfer Learning**: BERT and GPT embeddings support transfer learning, where the pre-trained models are used as feature extractors for downstream NLP tasks. By fine-tuning these models on task-specific data, users can leverage the knowledge encoded in the embeddings to achieve state-of-the-art performance on various natural language processing tasks.\n- **Applications**: BERT and GPT embeddings have a wide range of applications in natural language processing tasks such as text classification, named entity recognition, sentiment analysis, question-answering, and more. They provide powerful representations of text data that capture both semantic and syntactic information.\n\n\n## Matching with embeddings\n\n",
"supporting": [
"embeddings_files"
],
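The "Matching with embeddings" heading added to the embeddings page is still empty in this commit. One common way such a section is filled is cosine-similarity ranking; the sketch below uses placeholder vectors, on the assumption that the real page would take its vectors from the same embeddings client as the other notebooks:

```python
# Minimal sketch of matching with embeddings via cosine similarity. The vectors
# are random placeholders (assumption: real ones would come from the embeddings
# endpoint), but the ranking logic is the same either way.
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

candidates = ["cat", "dog", "banana", "happy"]
rng = np.random.default_rng(1)
candidate_vectors = rng.normal(size=(len(candidates), 16))
query_vector = rng.normal(size=16)

# rank candidates by similarity to the query, best match first
scores = [cosine_similarity(query_vector, v) for v in candidate_vectors]
ranking = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
print(ranking)
```

The same ranking code works unchanged once `candidate_vectors` and `query_vector` are replaced by actual embedding responses.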
4 changes: 2 additions & 2 deletions _freeze/embeddings/visualization/execute-results/html.json
@@ -1,8 +1,8 @@
{
"hash": "d5ff22765fb7d3cedaf671c0a3260196",
"hash": "2641c2092586c91c7b892401fcf1812a",
"result": {
"engine": "jupyter",
"markdown": "---\ntitle: Visualization\nformat:\n html:\n code-fold: false\n---\n\n::: {#a1745e0e .cell execution_count=1}\n``` {.python .cell-code}\n# prerequisites\n\nimport os\nfrom llm_utils.client import get_openai_client, OpenAIModels\n\nMODEL = OpenAIModels.EMBED.value\n\n# get the OpenAI client\nclient = get_openai_client(\n model=MODEL,\n config_path=os.environ.get(\"CONFIG_PATH\")\n)\n```\n:::\n\n\n::: {#1a762f3f .cell execution_count=2}\n``` {.python .cell-code}\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n```\n:::\n\n\n::: {#b0dfb78f .cell execution_count=3}\n``` {.python .cell-code}\n# Define a list of words to visualize\nwords = [\"king\", \"queen\", \"man\", \"woman\", \"apple\", \"banana\", \"grapes\", \"cat\", \"dog\", \"happy\", \"sad\"]\n\n# Get embeddings for the words\nresponse = client.embeddings.create(\n input=words,\n model=MODEL\n)\n\nembeddings = [emb.embedding for emb in response.data]\n```\n:::\n\n\n::: {#cell-tsne-visualization .cell execution_count=4}\n``` {.python .cell-code}\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n\n# Apply t-SNE dimensionality reduction\ntsne = TSNE(\n n_components=2, \n random_state=42,\n perplexity=5 # see documentation to set this correctly\n)\nembeddings_2d = tsne.fit_transform(np.array(embeddings))\n\n# Plot the embeddings in a two-dimensional scatter plot\nplt.figure(figsize=(10, 8))\nfor i, word in enumerate(words):\n x, y = embeddings_2d[i]\n plt.scatter(x, y, marker='o', color='red')\n plt.text(x, y, word, fontsize=9)\n\nplt.xlabel(\"t-SNE dimension 1\")\nplt.ylabel(\"t-SNE dimension 2\")\nplt.grid(True)\nplt.xticks([])\nplt.yticks([])\nplt.show()\n```\n\n::: {.cell-output .cell-output-display}\n![t-SNE visualization of word embeddings](visualization_files/figure-html/tsne-visualization-output-1.png){#tsne-visualization width=789 height=629}\n:::\n:::\n\n\n",
"markdown": "---\ntitle: Visualization & clustering of embeddings\nformat:\n html:\n code-fold: false\n---\n\n## Visualization of embeddings\n\n::: {#c274dbd5 .cell execution_count=1}\n``` {.python .cell-code}\n# prerequisites\n\nimport os\nfrom llm_utils.client import get_openai_client, OpenAIModels\n\nMODEL = OpenAIModels.EMBED.value\n\n# get the OpenAI client\nclient = get_openai_client(\n model=MODEL,\n config_path=os.environ.get(\"CONFIG_PATH\")\n)\n```\n:::\n\n\n::: {#67f8f501 .cell execution_count=2}\n``` {.python .cell-code}\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n```\n:::\n\n\n::: {#a7acae04 .cell execution_count=3}\n``` {.python .cell-code}\n# Define a list of words to visualize\nwords = [\"king\", \"queen\", \"man\", \"woman\", \"apple\", \"banana\", \"grapes\", \"cat\", \"dog\", \"happy\", \"sad\"]\n\n# Get embeddings for the words\nresponse = client.embeddings.create(\n input=words,\n model=MODEL\n)\n\nembeddings = [emb.embedding for emb in response.data]\n```\n:::\n\n\n::: {#cell-tsne-visualization .cell execution_count=4}\n``` {.python .cell-code}\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n\n# Apply t-SNE dimensionality reduction\ntsne = TSNE(\n n_components=2, \n random_state=42,\n perplexity=5 # see documentation to set this correctly\n)\nembeddings_2d = tsne.fit_transform(np.array(embeddings))\n\n# Plot the embeddings in a two-dimensional scatter plot\nplt.figure(figsize=(10, 8))\nfor i, word in enumerate(words):\n x, y = embeddings_2d[i]\n plt.scatter(x, y, marker='o', color='red')\n plt.text(x, y, word, fontsize=9)\n\nplt.xlabel(\"t-SNE dimension 1\")\nplt.ylabel(\"t-SNE dimension 2\")\nplt.grid(True)\nplt.xticks([])\nplt.yticks([])\nplt.show()\n```\n\n::: {.cell-output .cell-output-display}\n![t-SNE visualization of word embeddings](visualization_files/figure-html/tsne-visualization-output-1.png){#tsne-visualization width=789 height=629}\n:::\n:::\n\n\n## Clustering of embeddings\n\n::: {#a5570feb .cell execution_count=5}\n``` {.python .cell-code}\n# do the clustering\nimport numpy as np\nfrom sklearn.cluster import KMeans\n\nn_clusters = 5\n\n# define the model\nkmeans = KMeans(\n n_clusters=n_clusters,\n n_init=\"auto\",\n random_state=2 # do this to get the same output\n)\n\n# fit the model to the data\nkmeans.fit(np.array(embeddings))\n\n# get the cluster labels\ncluster_labels = kmeans.labels_\n```\n:::\n\n\n::: {#cell-tsne-visualization of clustering .cell execution_count=6}\n``` {.python .cell-code}\nimport matplotlib.pyplot as plt\n\nfrom sklearn.manifold import TSNE\n\n# Apply t-SNE dimensionality reduction\ntsne = TSNE(\n n_components=2, \n random_state=42,\n perplexity=5 # see documentation to set this correctly\n)\nembeddings_2d = tsne.fit_transform(np.array(embeddings))\n\n# Define a color map for clusters\ncolors = plt.cm.viridis(np.linspace(0, 1, n_clusters))\n\n# Plot the embeddings in a two-dimensional scatter plot\nplt.figure(figsize=(10, 8))\nfor i, word in enumerate(words):\n x, y = embeddings_2d[i]\n cluster_label = cluster_labels[i]\n color = colors[cluster_label]\n plt.scatter(x, y, marker='o', color=color)\n plt.text(x, y, word, fontsize=9)\n\nplt.xlabel(\"t-SNE dimension 1\")\nplt.ylabel(\"t-SNE dimension 2\")\nplt.grid(True)\nplt.xticks([])\nplt.yticks([])\nplt.show()\n```\n\n::: {.cell-output .cell-output-display}\n![t-SNE visualization of word embedding 
clusters](visualization_files/figure-html/tsne-visualization-of-clustering-output-1.png){#tsne-visualization-of-clustering width=789 height=629}\n:::\n:::\n\n\n",
"supporting": [
"visualization_files"
],
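Both notebooks set `perplexity=5` for t-SNE with only a comment pointing at the documentation. The relevant constraint, assuming a recent scikit-learn release, is that perplexity must be smaller than the number of samples, and there are only 11 words here, so the default of 30 would not work. A small self-contained check:

```python
# Minimal sketch: scikit-learn's TSNE requires perplexity < n_samples, so with
# only 11 word embeddings the default perplexity of 30 would fail; perplexity=5
# works. Random vectors stand in for the real embeddings (assumption).
import numpy as np
from sklearn.manifold import TSNE

points = np.random.default_rng(42).normal(size=(11, 16))  # 11 samples, like the 11 words

tsne = TSNE(n_components=2, random_state=42, perplexity=5)  # must be < 11
points_2d = tsne.fit_transform(points)
print(points_2d.shape)  # (11, 2)
```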
