diff --git a/tamingllms/_build/.doctrees/environment.pickle b/tamingllms/_build/.doctrees/environment.pickle
index 19c7d07..2659c72 100644
Binary files a/tamingllms/_build/.doctrees/environment.pickle and b/tamingllms/_build/.doctrees/environment.pickle differ
diff --git a/tamingllms/_build/.doctrees/markdown/preface.doctree b/tamingllms/_build/.doctrees/markdown/preface.doctree
index b0f353c..7359998 100644
Binary files a/tamingllms/_build/.doctrees/markdown/preface.doctree and b/tamingllms/_build/.doctrees/markdown/preface.doctree differ
diff --git a/tamingllms/_build/.doctrees/markdown/toc.doctree b/tamingllms/_build/.doctrees/markdown/toc.doctree
index 50ce0d7..426fa06 100644
Binary files a/tamingllms/_build/.doctrees/markdown/toc.doctree and b/tamingllms/_build/.doctrees/markdown/toc.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/alignment.doctree b/tamingllms/_build/.doctrees/notebooks/alignment.doctree
index 4c0ba1b..c5826fd 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/alignment.doctree and b/tamingllms/_build/.doctrees/notebooks/alignment.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/cost.doctree b/tamingllms/_build/.doctrees/notebooks/cost.doctree
index 6c13c11..ff51b9b 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/cost.doctree and b/tamingllms/_build/.doctrees/notebooks/cost.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/evals.doctree b/tamingllms/_build/.doctrees/notebooks/evals.doctree
index c3de329..c036c74 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/evals.doctree and b/tamingllms/_build/.doctrees/notebooks/evals.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/input.doctree b/tamingllms/_build/.doctrees/notebooks/input.doctree
new file mode 100644
index 0000000..3493903
Binary files /dev/null and b/tamingllms/_build/.doctrees/notebooks/input.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/local.doctree b/tamingllms/_build/.doctrees/notebooks/local.doctree
index be91ddb..54b5b67 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/local.doctree and b/tamingllms/_build/.doctrees/notebooks/local.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/safety.doctree b/tamingllms/_build/.doctrees/notebooks/safety.doctree
index a49b2c3..c06a8bd 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/safety.doctree and b/tamingllms/_build/.doctrees/notebooks/safety.doctree differ
diff --git a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree
index 526478c..d0960da 100644
Binary files a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree and b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree differ
diff --git a/tamingllms/_build/html/_images/2025.png b/tamingllms/_build/html/_images/2025.png
new file mode 100644
index 0000000..e5e7914
Binary files /dev/null and b/tamingllms/_build/html/_images/2025.png differ
diff --git a/tamingllms/_build/html/_images/anth_contextual.png b/tamingllms/_build/html/_images/anth_contextual.png
new file mode 100644
index 0000000..c8401c0
Binary files /dev/null and b/tamingllms/_build/html/_images/anth_contextual.png differ
diff --git a/tamingllms/data/input/asset_class.png b/tamingllms/_build/html/_images/asset_class.png
similarity index 100%
rename from tamingllms/data/input/asset_class.png
rename to tamingllms/_build/html/_images/asset_class.png
diff --git a/tamingllms/_build/html/_images/cic.png b/tamingllms/_build/html/_images/cic.png
new file mode 100644
index 0000000..5b180e5
Binary files /dev/null and b/tamingllms/_build/html/_images/cic.png differ
diff --git a/tamingllms/_build/html/_images/deep.jpeg b/tamingllms/_build/html/_images/deep.jpeg
new file mode 100644
index 0000000..342a13b
Binary files /dev/null and b/tamingllms/_build/html/_images/deep.jpeg differ
diff --git a/tamingllms/_build/html/_images/deep2.jpeg b/tamingllms/_build/html/_images/deep2.jpeg
new file mode 100644
index 0000000..c370001
Binary files /dev/null and b/tamingllms/_build/html/_images/deep2.jpeg differ
diff --git a/tamingllms/_build/html/_images/diagram1.png b/tamingllms/_build/html/_images/diagram1.png
new file mode 100644
index 0000000..1470769
Binary files /dev/null and b/tamingllms/_build/html/_images/diagram1.png differ
diff --git a/tamingllms/_build/html/_images/docling.png b/tamingllms/_build/html/_images/docling.png
new file mode 100644
index 0000000..143ded9
Binary files /dev/null and b/tamingllms/_build/html/_images/docling.png differ
diff --git a/tamingllms/_build/html/_images/forecast.png b/tamingllms/_build/html/_images/forecast.png
new file mode 100644
index 0000000..905776c
Binary files /dev/null and b/tamingllms/_build/html/_images/forecast.png differ
diff --git a/tamingllms/_build/html/_images/harvard.png b/tamingllms/_build/html/_images/harvard.png
new file mode 100644
index 0000000..0b60f7d
Binary files /dev/null and b/tamingllms/_build/html/_images/harvard.png differ
diff --git a/tamingllms/_build/html/_images/markitdown.png b/tamingllms/_build/html/_images/markitdown.png
new file mode 100644
index 0000000..282503c
Binary files /dev/null and b/tamingllms/_build/html/_images/markitdown.png differ
diff --git a/tamingllms/_build/html/_images/quiz.png b/tamingllms/_build/html/_images/quiz.png
new file mode 100644
index 0000000..a627f7c
Binary files /dev/null and b/tamingllms/_build/html/_images/quiz.png differ
diff --git a/tamingllms/_build/html/_sources/notebooks/input.ipynb b/tamingllms/_build/html/_sources/notebooks/input.ipynb
new file mode 100644
index 0000000..d05669d
--- /dev/null
+++ b/tamingllms/_build/html/_sources/notebooks/input.ipynb
@@ -0,0 +1,2454 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(input)=\n",
+    "# Managing Input Data\n",
+    "```{epigraph}\n",
+    "One home run is much better than two doubles.\n",
+    "\n",
+    "-- Steve Jobs\n",
+    "```\n",
+    "```{contents}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction\n",
+    "\n",
+    "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n",
+    "\n",
+    "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n",
+    "\n",
+    "Motivated by these challenges, this chapter explores two key components:\n",
+    "\n",
+    "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n",
+    "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n",
+    "\n",
+    "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n",
+    "\n",
+    "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n",
+    "\n",
+    "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n",
+    "\n",
+    "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Parsing Documents\n",
+    "\n",
+    "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n",
+    "\n",
+    "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### MarkItDown\n",
+    "\n",
+    "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n",
+    "\n",
+    "Key features:\n",
+    "- Simple command-line and Python API interfaces\n",
+    "- Support for multiple file formats\n",
+    "- Optional LLM integration for enhanced image descriptions\n",
+    "- Batch processing capabilities\n",
+    "- Docker support for containerized usage\n",
+    "\n",
+    "Sample usage:\n",
+    "```python\n",
+    "from markitdown import MarkItDown\n",
+    "\n",
+    "md = MarkItDown()\n",
+    "result = md.convert(\"test.xlsx\")\n",
+    "print(result.text_content)\n",
+    "```\n",
+    "\n",
+    "### Docling\n",
+    "\n",
+    "Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n",
+    "\n",
+    "Key features:\n",
+    "- Support for multiple document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, etc.)\n",
+    "- Advanced PDF parsing including layout analysis and table extraction\n",
+    "- Unified document representation format\n",
+    "- Integration with LlamaIndex and LangChain\n",
+    "- OCR support for scanned documents\n",
+    "- Simple CLI interface\n",
+    "\n",
+    "Sample usage:\n",
+    "```python\n",
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "converter = DocumentConverter()\n",
+    "result = converter.convert(\"document.pdf\")\n",
+    "print(result.document.export_to_markdown())\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Frameworks-Based Parsing\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Structured Data Extraction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n",
+    "\n",
+    "\n",
+    "```{figure} ../data/input/forecast.png\n",
+    "---\n",
+    "name: forecast\n",
+    "alt: Forecast\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Forecast\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FORECAST_FILE_PATH = \"../data/input/forecast.pdf\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, we will use MarkItDown to extract the text content from the document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from markitdown import MarkItDown\n",
+    "\n",
+    "md = MarkItDown()\n",
+    "result_md = md.convert(FORECAST_FILE_PATH).text_content"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we will do the same with Docling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "converter = DocumentConverter()\n",
+    "forecast_result_docling = converter.convert(source).document.export_to_markdown()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import Levenshtein\n",
+    "def levenshtein_similarity(text1: str, text2: str) -> float:\n",
+    "    \"\"\"\n",
+    "    Calculate normalized Levenshtein distance\n",
+    "    Returns value between 0 (completely different) and 1 (identical)\n",
+    "    \"\"\"\n",
+    "    distance = Levenshtein.distance(text1, text2)\n",
+    "    max_len = max(len(text1), len(text2))\n",
+    "    return 1 - (distance / max_len)\n",
+    "\n",
+    "from difflib import SequenceMatcher\n",
+    "def simple_similarity(text1: str, text2: str) -> float:\n",
+    "    \"\"\"\n",
+    "    Calculate similarity ratio using SequenceMatcher\n",
+    "    Returns value between 0 (completely different) and 1 (identical)\n",
+    "    \"\"\"\n",
+    "    return SequenceMatcher(None, text1, text2).ratio()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.13985705461925346"
+      ]
+     },
+     "execution_count": 80,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "levenshtein_similarity(forecast_result_md, forecast_result_docling)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.17779960707269155"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "simple_similarity(forecast_result_md, forecast_result_docling)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher` respectively."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Docling's result is a quite readable markdown displaying key economic variables and their forecasts. Conversely, MarkItDown's result is a bit messy and hard to read but the information is there just not in a structured format. Does it matter? That's what we will explore next."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Docling's result**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(Markdown(forecast_result_docling))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "{numref}`docling` shows part of the parsed result from Docling."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```{figure} ../_static/input/docling.png\n",
+    "---\n",
+    "name: docling\n",
+    "alt: Docling's result\n",
+    "scale: 60%\n",
+    "align: center\n",
+    "---\n",
+    "Docling's parsed result\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**MarkItDown's result**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import display, Markdown\n",
+    "display(Markdown(forecast_result_md[:500]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "{numref}`markitdown` shows part of the parsed result from MarkItDown.\n",
+    "\n",
+    "```{figure} ../_static/input/markitdown.png\n",
+    "---\n",
+    "name: markitdown\n",
+    "alt: MarkItDown's parsed result\n",
+    "scale: 60%\n",
+    "align: center\n",
+    "---\n",
+    "MarkItDown's parsed result\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's focus on the economic forecasts. In particular, we are interested in extracting the CIO's 2025E forecasts.\n",
+    "\n",
+    "```{figure} ../_static/input/2025.png\n",
+    "---\n",
+    "name: forecast2025\n",
+    "alt: Forecast 2025\n",
+    "scale: 45%\n",
+    "align: center\n",
+    "---\n",
+    "Forecast 2025\n",
+    "```\n",
+    "\n",
+    "We will define a `Forecast` pydantic model to represent an economic forecast composed of a `financial_variable` and a `financial_forecast`. We will also define a `EconForecast` pydantic model to represent the list of economic forecasts we want to extract from the document.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pydantic import BaseModel\n",
+    "class Forecast(BaseModel):\n",
+    "    financial_variable: str\n",
+    "    financial_forecast: float\n",
+    "class EconForecast(BaseModel):\n",
+    "    forecasts: list[Forecast]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "BASE_PROMPT = f\"\"\"\n",
+    "    ROLE: You are an expert at structured data extraction. \n",
+    "    TASK: Extract the following data {extract_prompt} from input DOCUMENT\n",
+    "    FORMAT: The output should be a JSON object with 'financial_variable' as key and 'financial_forecast' as value.\n",
+    "    \"\"\"\n",
+    "prompt = f\"{BASE_PROMPT} \\n\\n DOCUMENT: {doc}\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_from_doc(extract_prompt: str,  doc: str, client) -> EconForecast:\n",
+    "    \"\"\"\n",
+    "    Extract data of a financial document using an LLM model.\n",
+    "    \n",
+    "    Args:\n",
+    "        doc: The financial document text to analyze\n",
+    "        client: The LLM model to use for analysis\n",
+    "        extract_prompt: The prompt to use for extraction\n",
+    "        \n",
+    "    Returns:\n",
+    "        EconForecasts object containing sentiment analysis results\n",
+    "    \"\"\"\n",
+    "\n",
+    "    BASE_PROMPT = f\"\"\"\n",
+    "    ROLE: You are an expert at structured data extraction. \n",
+    "    TASK: Extract the following data {extract_prompt} from input DOCUMENT\n",
+    "    FORMAT: The output should be a JSON object with 'financial_variable' as key and 'financial_forecast' as value.\n",
+    "    \"\"\"\n",
+    "    prompt = f\"{BASE_PROMPT} \\n\\n DOCUMENT: {doc}\"\n",
+    "    completion = client.beta.chat.completions.parse(\n",
+    "        model=\"gpt-4o-mini\",\n",
+    "        messages=[\n",
+    "            {\n",
+    "                \"role\": \"system\",\n",
+    "                \"content\": prompt\n",
+    "            },\n",
+    "            {\"role\": \"user\", \"content\": doc}\n",
+    "        ],\n",
+    "        response_format=EconForecast\n",
+    "    )\n",
+    "    return completion.choices[0].message.parsed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "import os\n",
+    "\n",
+    "# Load environment variables from .env file\n",
+    "load_dotenv(override=True)\n",
+    "from openai import OpenAI\n",
+    "client = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The user then calls the `extract_from_doc` function simply defining that \"Economic Forecasts for 2025E\" is the data they would like to extract from the document. We perform the extraction twice, once with MarkItDown and once with Docling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extract_prompt = \"Economic Forecasts for 2025E\"\n",
+    "md_financials = extract_from_doc(extract_prompt, forecast_result_md, client)\n",
+    "docling_financials = extract_from_doc(extract_prompt, forecast_result_docling, client)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The response is an `EconForecast` object containing a list of `Forecast` objects, as defined in the pydantic model. We can then convert the response to a pandas DataFrame for easier comparison."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "EconForecast(forecasts=[Forecast(financial_variable='Real global GDP (% y/y annualized)', financial_forecast=3.2), Forecast(financial_variable='Real U.S. GDP (% q/q annualized)', financial_forecast=2.4), Forecast(financial_variable='CPI inflation (% y/y)', financial_forecast=2.5), Forecast(financial_variable='Core CPI inflation (% y/y)', financial_forecast=3.0), Forecast(financial_variable='Unemployment rate (%)', financial_forecast=4.3), Forecast(financial_variable='Fed funds rate, end period (%)', financial_forecast=3.88)])"
+      ]
+     },
+     "execution_count": 99,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "md_financials"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_md_forecasts = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in md_financials.forecasts], \n",
+    "                      columns=['Variable', 'Forecast'])\n",
+    "df_docling_forecasts = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in docling_financials.forecasts], \n",
+    "                      columns=['Variable', 'Forecast'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Variable</th>\n",
+       "      <th>Forecast</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Real global GDP (% y/y annualized)</td>\n",
+       "      <td>3.20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Real U.S. GDP (% q/q annualized)</td>\n",
+       "      <td>2.40</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>CPI inflation (% y/y)</td>\n",
+       "      <td>2.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Core CPI inflation (% y/y)</td>\n",
+       "      <td>3.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Unemployment rate (%)</td>\n",
+       "      <td>4.30</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Fed funds rate, end period (%)</td>\n",
+       "      <td>3.88</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                             Variable  Forecast\n",
+       "0  Real global GDP (% y/y annualized)      3.20\n",
+       "1    Real U.S. GDP (% q/q annualized)      2.40\n",
+       "2               CPI inflation (% y/y)      2.50\n",
+       "3          Core CPI inflation (% y/y)      3.00\n",
+       "4               Unemployment rate (%)      4.30\n",
+       "5      Fed funds rate, end period (%)      3.88"
+      ]
+     },
+     "execution_count": 97,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_md_forecasts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Variable</th>\n",
+       "      <th>Forecast</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Real global GDP (% y/y annualized)</td>\n",
+       "      <td>3.20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Real U.S. GDP (% q/q annualized)</td>\n",
+       "      <td>2.40</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>CPI inflation (% y/y)</td>\n",
+       "      <td>2.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Core CPI inflation (% y/y)</td>\n",
+       "      <td>3.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Unemployment rate (%)</td>\n",
+       "      <td>4.30</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Fed funds rate, end period (%)</td>\n",
+       "      <td>3.88</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                             Variable  Forecast\n",
+       "0  Real global GDP (% y/y annualized)      3.20\n",
+       "1    Real U.S. GDP (% q/q annualized)      2.40\n",
+       "2               CPI inflation (% y/y)      2.50\n",
+       "3          Core CPI inflation (% y/y)      3.00\n",
+       "4               Unemployment rate (%)      4.30\n",
+       "5      Fed funds rate, end period (%)      3.88"
+      ]
+     },
+     "execution_count": 98,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_docling_forecasts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The results from MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches enabled the LLM to successfully extract the economic forecast data with equal accuracy, in this particular case."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n",
+    "```{figure} ../_static/input/asset_class.png\n",
+    "---\n",
+    "name: asset_class\n",
+    "alt: Asset Class Weightings\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Asset Class Weightings\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The user will simply define the following data to extract: \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\". In that way, we expect that \"Underweight\" will be mapped to -2, \"Neutral\" to 0 and \"Overweight\" to 2 with some values in between."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extract_prompt = \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\"\n",
+    "asset_class_docling = extract_from_doc(extract_prompt, forecast_result_docling, client)\n",
+    "asset_class_md = extract_from_doc(extract_prompt, forecast_result_md, client)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "df_md = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_md.forecasts], \n",
+    "                 columns=['Variable', 'Forecast'])\n",
+    "df_docling = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_docling.forecasts], \n",
+    "                 columns=['Variable', 'Forecast'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>variable</th>\n",
+       "      <th>markitdown</th>\n",
+       "      <th>docling</th>\n",
+       "      <th>true_value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Global Equities</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>U.S. Large Cap Growth</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>U.S. Large Cap Value</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>U.S. Small Cap Growth</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>U.S. Small Cap Value</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>International Developed</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Emerging Markets</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Global Fixed Income</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>U.S. Governments</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>U.S. Mortgages</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>U.S. Corporates</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>International Fixed Income</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>High Yield</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>U.S. Investment-grade</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Tax Exempt U.S. High Yield Tax Exempt</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                 variable  markitdown  docling  true_value\n",
+       "0                         Global Equities         1.0      1.0         1.0\n",
+       "1                   U.S. Large Cap Growth         1.0      1.0         0.0\n",
+       "2                    U.S. Large Cap Value         1.0      1.0         1.0\n",
+       "3                   U.S. Small Cap Growth         1.0      1.0         1.0\n",
+       "4                    U.S. Small Cap Value         1.0      1.0         1.0\n",
+       "5                 International Developed         1.0     -1.0        -1.0\n",
+       "6                        Emerging Markets         1.0      0.0         0.0\n",
+       "7                     Global Fixed Income        -1.0     -1.0        -1.0\n",
+       "8                        U.S. Governments        -1.0      1.0         1.0\n",
+       "9                          U.S. Mortgages        -1.0      1.0         1.0\n",
+       "10                        U.S. Corporates        -1.0     -1.0        -1.0\n",
+       "11             International Fixed Income        -1.0      0.0         0.0\n",
+       "12                             High Yield        -1.0     -1.0        -1.0\n",
+       "13                  U.S. Investment-grade        -1.0      0.0         0.0\n",
+       "14  Tax Exempt U.S. High Yield Tax Exempt        -1.0     -1.0        -1.0"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Create DataFrame with specified columns\n",
+    "df_comparison = pd.DataFrame({\n",
+    "    'variable': df_docling['Variable'].iloc[:-1],\n",
+    "    'markitdown': df_md['Forecast'],\n",
+    "    'docling': df_docling['Forecast'].iloc[:-1],  # Drop last row\n",
+    "    'true_value': [1.0, 0.0, 1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 0.0, -1.0]\n",
+    "})\n",
+    "\n",
+    "display(df_comparison)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Markitdown accuracy: 53.33%\n",
+      "Docling accuracy: 93.33%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Calculate accuracy for markitdown and docling\n",
+    "markitdown_accuracy = (df_comparison['markitdown'] == df_comparison['true_value']).mean()\n",
+    "docling_accuracy = (df_comparison['docling'] == df_comparison['true_value']).mean()\n",
+    "\n",
+    "print(f\"Markitdown accuracy: {markitdown_accuracy:.2%}\")\n",
+    "print(f\"Docling accuracy: {docling_accuracy:.2%}\") \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n",
+    "\n",
+    "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n",
+    "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "from docling.document_converter import DocumentConverter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_and_export_tables(file_path: Path) -> list[pd.DataFrame]:\n",
+    "    \"\"\"\n",
+    "    Convert document and export tables to DataFrames.\n",
+    "    \n",
+    "    Args:\n",
+    "        file_path: Path to input document\n",
+    "        \n",
+    "    Returns:\n",
+    "        List of pandas DataFrames containing the tables\n",
+    "    \"\"\"\n",
+    "    doc_converter = DocumentConverter()\n",
+    "    start_time = time.time()\n",
+    "    \n",
+    "    conv_res = doc_converter.convert(file_path)\n",
+    "    \n",
+    "    tables = []\n",
+    "    # Export tables\n",
+    "    for table in conv_res.document.tables:\n",
+    "        table_df: pd.DataFrame = table.export_to_dataframe()\n",
+    "        tables.append(table_df)\n",
+    "\n",
+    "    end_time = time.time() - start_time\n",
+    "    print(f\"Document converted in {end_time:.2f} seconds.\")\n",
+    "    \n",
+    "    return tables\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert and export tables\n",
+    "tables = convert_and_export_tables(Path(FORECAST_FILE_PATH))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7"
+      ]
+     },
+     "execution_count": 100,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(tables)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>Total Return in USD (%).Current</th>\n",
+       "      <th>Total Return in USD (%).WTD</th>\n",
+       "      <th>Total Return in USD (%).MTD</th>\n",
+       "      <th>Total Return in USD (%).YTD</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>DJIA</td>\n",
+       "      <td>43,828.06</td>\n",
+       "      <td>-1.8</td>\n",
+       "      <td>-2.3</td>\n",
+       "      <td>18.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>NASDAQ</td>\n",
+       "      <td>19,926.72</td>\n",
+       "      <td>0.4</td>\n",
+       "      <td>3.7</td>\n",
+       "      <td>33.7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>S&amp;P 500</td>\n",
+       "      <td>6,051.09</td>\n",
+       "      <td>-0.6</td>\n",
+       "      <td>0.4</td>\n",
+       "      <td>28.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>S&amp;P 400 Mid Cap</td>\n",
+       "      <td>3,277.20</td>\n",
+       "      <td>-1.6</td>\n",
+       "      <td>-2.6</td>\n",
+       "      <td>19.5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Russell 2000</td>\n",
+       "      <td>2,346.90</td>\n",
+       "      <td>-2.5</td>\n",
+       "      <td>-3.5</td>\n",
+       "      <td>17.3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>MSCI World</td>\n",
+       "      <td>3,817.24</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>0.2</td>\n",
+       "      <td>22.1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>MSCI EAFE</td>\n",
+       "      <td>2,319.05</td>\n",
+       "      <td>-1.5</td>\n",
+       "      <td>0.2</td>\n",
+       "      <td>6.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>MSCI Emerging Markets</td>\n",
+       "      <td>1,107.01</td>\n",
+       "      <td>0.3</td>\n",
+       "      <td>2.7</td>\n",
+       "      <td>10.6</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         Total Return in USD (%).Current  \\\n",
+       "0                   DJIA                       43,828.06   \n",
+       "1                 NASDAQ                       19,926.72   \n",
+       "2                S&P 500                        6,051.09   \n",
+       "3        S&P 400 Mid Cap                        3,277.20   \n",
+       "4           Russell 2000                        2,346.90   \n",
+       "5             MSCI World                        3,817.24   \n",
+       "6              MSCI EAFE                        2,319.05   \n",
+       "7  MSCI Emerging Markets                        1,107.01   \n",
+       "\n",
+       "  Total Return in USD (%).WTD Total Return in USD (%).MTD  \\\n",
+       "0                        -1.8                        -2.3   \n",
+       "1                         0.4                         3.7   \n",
+       "2                        -0.6                         0.4   \n",
+       "3                        -1.6                        -2.6   \n",
+       "4                        -2.5                        -3.5   \n",
+       "5                        -1.0                         0.2   \n",
+       "6                        -1.5                         0.2   \n",
+       "7                         0.3                         2.7   \n",
+       "\n",
+       "  Total Return in USD (%).YTD  \n",
+       "0                        18.4  \n",
+       "1                        33.7  \n",
+       "2                        28.6  \n",
+       "3                        19.5  \n",
+       "4                        17.3  \n",
+       "5                        22.1  \n",
+       "6                         6.4  \n",
+       "7                        10.6  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(tables[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>Total Return in USD (%).Current</th>\n",
+       "      <th>Total Return in USD (%).WTD</th>\n",
+       "      <th>Total Return in USD (%).MTD</th>\n",
+       "      <th>Total Return in USD (%).YTD</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Corporate &amp; Government</td>\n",
+       "      <td>4.66</td>\n",
+       "      <td>-1.34</td>\n",
+       "      <td>-0.92</td>\n",
+       "      <td>1.94</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Agencies</td>\n",
+       "      <td>4.54</td>\n",
+       "      <td>-0.58</td>\n",
+       "      <td>-0.31</td>\n",
+       "      <td>3.35</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Municipals</td>\n",
+       "      <td>3.55</td>\n",
+       "      <td>-0.87</td>\n",
+       "      <td>-0.54</td>\n",
+       "      <td>1.99</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>U.S. Investment Grade Credit</td>\n",
+       "      <td>4.79</td>\n",
+       "      <td>-1.38</td>\n",
+       "      <td>-0.93</td>\n",
+       "      <td>1.97</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>International</td>\n",
+       "      <td>5.17</td>\n",
+       "      <td>-1.40</td>\n",
+       "      <td>-0.90</td>\n",
+       "      <td>3.20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>High Yield</td>\n",
+       "      <td>7.19</td>\n",
+       "      <td>-0.22</td>\n",
+       "      <td>0.20</td>\n",
+       "      <td>8.87</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>90 Day Yield</td>\n",
+       "      <td>4.32</td>\n",
+       "      <td>4.39</td>\n",
+       "      <td>4.49</td>\n",
+       "      <td>5.33</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>2 Year Yield</td>\n",
+       "      <td>4.24</td>\n",
+       "      <td>4.10</td>\n",
+       "      <td>4.15</td>\n",
+       "      <td>4.25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>10 Year Yield</td>\n",
+       "      <td>4.40</td>\n",
+       "      <td>4.15</td>\n",
+       "      <td>4.17</td>\n",
+       "      <td>3.88</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>30 Year Yield</td>\n",
+       "      <td>4.60</td>\n",
+       "      <td>4.34</td>\n",
+       "      <td>4.36</td>\n",
+       "      <td>4.03</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                Total Return in USD (%).Current  \\\n",
+       "0        Corporate & Government                            4.66   \n",
+       "1                      Agencies                            4.54   \n",
+       "2                    Municipals                            3.55   \n",
+       "3  U.S. Investment Grade Credit                            4.79   \n",
+       "4                 International                            5.17   \n",
+       "5                    High Yield                            7.19   \n",
+       "6                  90 Day Yield                            4.32   \n",
+       "7                  2 Year Yield                            4.24   \n",
+       "8                 10 Year Yield                            4.40   \n",
+       "9                 30 Year Yield                            4.60   \n",
+       "\n",
+       "  Total Return in USD (%).WTD Total Return in USD (%).MTD  \\\n",
+       "0                       -1.34                       -0.92   \n",
+       "1                       -0.58                       -0.31   \n",
+       "2                       -0.87                       -0.54   \n",
+       "3                       -1.38                       -0.93   \n",
+       "4                       -1.40                       -0.90   \n",
+       "5                       -0.22                        0.20   \n",
+       "6                        4.39                        4.49   \n",
+       "7                        4.10                        4.15   \n",
+       "8                        4.15                        4.17   \n",
+       "9                        4.34                        4.36   \n",
+       "\n",
+       "  Total Return in USD (%).YTD  \n",
+       "0                        1.94  \n",
+       "1                        3.35  \n",
+       "2                        1.99  \n",
+       "3                        1.97  \n",
+       "4                        3.20  \n",
+       "5                        8.87  \n",
+       "6                        5.33  \n",
+       "7                        4.25  \n",
+       "8                        3.88  \n",
+       "9                        4.03  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(tables[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Sector</th>\n",
+       "      <th>CIO View.</th>\n",
+       "      <th>CIO View.Underweight</th>\n",
+       "      <th>CIO View.Neutral</th>\n",
+       "      <th>CIO View.</th>\n",
+       "      <th>CIO View.Overweight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Utilities</td>\n",
+       "      <td>slight over weight green   </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Financials</td>\n",
+       "      <td>slight over weight green   </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Healthcare</td>\n",
+       "      <td>slight over weight green   </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Consumer  Discretionary</td>\n",
+       "      <td>Slight over weight green  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Information  Technology</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Communication  Services</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Industrials</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Real Estate</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Energy</td>\n",
+       "      <td>slight underweight orange  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Materials</td>\n",
+       "      <td>slight underweight orange  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Consumer  Staples</td>\n",
+       "      <td>underweight red</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                     Sector                     CIO View.  \\\n",
+       "0                 Utilities  slight over weight green      \n",
+       "1                Financials  slight over weight green      \n",
+       "2                Healthcare  slight over weight green      \n",
+       "3   Consumer  Discretionary   Slight over weight green     \n",
+       "4   Information  Technology             Neutral yellow     \n",
+       "5   Communication  Services             Neutral yellow     \n",
+       "6               Industrials             Neutral yellow     \n",
+       "7               Real Estate             Neutral yellow     \n",
+       "8                    Energy  slight underweight orange     \n",
+       "9                 Materials  slight underweight orange     \n",
+       "10        Consumer  Staples               underweight red   \n",
+       "\n",
+       "   CIO View.Underweight CIO View.Neutral CIO View. CIO View.Overweight  \n",
+       "0                                                                    \n",
+       "1                                                                    \n",
+       "2                                                                    \n",
+       "3                                                                    \n",
+       "4                                                                    \n",
+       "5                                                                    \n",
+       "6                                                                    \n",
+       "7                                                                    \n",
+       "8                                                                    \n",
+       "9                                                                    \n",
+       "10                                                                  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(tables[6])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model to its constructor."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "md_llm = MarkItDown(llm_client=client, llm_model=\"gpt-4o-mini\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = md_llm.convert(\"../data/input/forecast.png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here's the description we obtain from the image of our input document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "\n",
+       "# Description:\n",
+       "**Markets in Review: Economic Forecasts and Asset Class Weightings (as of 12/13/2024)**\n",
+       "\n",
+       "This detailed market overview presents key performance metrics and economic forecasts as of December 13, 2024.\n",
+       "\n",
+       "**Equities Overview:**\n",
+       "- **Total Returns:** Highlights returns for major indices such as the DJIA (18.4% YTD), NASDAQ (33.7% YTD), and S&P 500 (28.6% YTD), showcasing strong performance across the board.\n",
+       "- **Forecasts:** Economic indicators reveal a projected real global GDP growth of 3.1%, with inflation rates expected to stabilize around 2.2% in 2025. Unemployment rates are anticipated to remain low at 4.4%.\n",
+       "\n",
+       "**Fixed Income:**\n",
+       "- Focuses on various segments, including Corporate & Government bonds, which offer an annualized return of 4.66% and indicate shifting trends in interest rates over 2-Year (4.25%) and 10-Year (4.03%) bonds.\n",
+       "\n",
+       "**Commodities & Currencies:**\n",
+       "- Commodities such as crude oil and gold show varied performance, with oil increasing by 4.8% and gold prices sitting at $2,648.23 per ounce.\n",
+       "- Currency metrics highlight the Euro and USD trends over the past year.\n",
+       "\n",
+       "**S&P Sector Returns:**\n",
+       "- A quick reference for sector performance indicates a significant 2.5% return in Communication Services, while other sectors like Consumer Staples and Materials display minor fluctuations.\n",
+       "\n",
+       "**CIO Asset Class Weightings:**\n",
+       "- Emphasizes strategic asset allocation recommendations which are crucial for an investor's portfolio. Underweight positions in U.S. Small Cap Growth and International Developed contrast with overweight positions in certain sectors such as Utilities and Financials, signaling tactical shifts based on ongoing economic assessments.\n",
+       "\n",
+       "**Note:** This summary is sourced from BofA Global Research and aims to provide a comprehensive view of current market conditions and forecasts to assist investors in making informed decisions.\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(Markdown(result.text_content))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "Overall, the description is somewhat accurate but contains a few inaccuracies including:\n",
+    "\n",
+    "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n",
+    "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n",
+    "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n",
+    "\n",
+    "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Retrieval-Augmented Generation\n",
+    "\n",
+    "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n",
+    "\n",
+    "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Case Studies\n",
+    "\n",
+    "This section presents three case studies that demonstrate practical solutions to common LLM limitations:\n",
+    "\n",
+    "First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n",
+    "\n",
+    "Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.\n",
+    "\n",
+    "Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Case Study I: Content Chunking with Contextual Linking\n",
+    "\n",
+    "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n",
+    "1. The LLM's inability to process long inputs to do context-size limits\n",
+    "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n",
+    "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n",
+    "\n",
+    "Here, we exemplify this technique by following these steps:\n",
+    "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n",
+    "\n",
+    "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n",
+    "\n",
+    "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n",
+    "\n",
+    "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n",
+    "\n",
+    "Let's examine an example implementation of this technique.\n",
+    "\n",
+    "#### Generating long-form content\n",
+    "\n",
+    "- Goal: Generate a long-form report analyzing a company's financial statement.\n",
+    "- Input: A company's 10K SEC filing.\n",
+    "\n",
+    "```{figure} ../_static/structured_output/diagram1.png\n",
+    "---\n",
+    "name: content-chunking-with-contextual-linking\n",
+    "alt: Content Chunking with Contextual Linking\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Content Chunking with Contextual Linking Schematic Representation.\n",
+    "```\n",
+    "\n",
+    "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n",
+    "\n",
+    "**Step 1: Chunking the Content**\n",
+    "\n",
+    "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n",
+    "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n",
+    "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n",
+    "  - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n",
+    "  - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n",
+    "  - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n",
+    "\n",
+    "  Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n",
+    "    \"\"\"\n",
+    "    Split input text into chunks of specified size with specified overlap.\n",
+    "\n",
+    "    Args:\n",
+    "        text (str): The input text to be chunked.\n",
+    "        chunk_size (int): The maximum size of each chunk in tokens.\n",
+    "        chunk_overlap (int): The number of tokens to overlap between chunks.\n",
+    "\n",
+    "    Returns:\n",
+    "        list: A list of text chunks.\n",
+    "    \"\"\"\n",
+    "    from langchain_text_splitters import CharacterTextSplitter\n",
+    "\n",
+    "    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
+    "    return text_splitter.split_text(text)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Step 2: Writing the Base Prompt Template**\n",
+    "\n",
+    "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n",
+    "- `role`: Defines the role or persona the model should assume.\n",
+    "- `context`: Provides the background information or context for the task.\n",
+    "- `instruction`: Specifies the task or action the model needs to perform.\n",
+    "- `input_text`: Contains the actual text input that the model will process.\n",
+    "- `requirements`: Lists any specific requirements or constraints for the output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.prompts import PromptTemplate\n",
+    "def get_base_prompt_template() -> str:\n",
+    "    \n",
+    "    base_prompt = \"\"\"\n",
+    "    ROLE: {role}\n",
+    "    CONTEXT: {context}\n",
+    "    INSTRUCTION: {instruction}\n",
+    "    INPUT: {input}\n",
+    "    REQUIREMENTS: {requirements}\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    prompt = PromptTemplate.from_template(base_prompt)\n",
+    "    return prompt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.output_parsers import StrOutputParser\n",
+    "from langchain_community.chat_models import ChatLiteLLM\n",
+    "\n",
+    "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n",
+    "    \"\"\"\n",
+    "    Returns an LLMChain instance using langchain.\n",
+    "\n",
+    "    Args:\n",
+    "        prompt_template (str): The prompt template to use.\n",
+    "        model_name (str): The name of the model to use.\n",
+    "        temperature (float): The temperature setting for the model.\n",
+    "\n",
+    "    Returns:\n",
+    "        llm_chain: An instance of the LLMChain.\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    from dotenv import load_dotenv\n",
+    "    import os\n",
+    "\n",
+    "    # Load environment variables from .env file\n",
+    "    load_dotenv()\n",
+    "    \n",
+    "    api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n",
+    "    llm = ChatLiteLLM(\n",
+    "        model=model_name,\n",
+    "        temperature=temperature,\n",
+    "        api_key=os.environ[api_key_label],\n",
+    "    )\n",
+    "    llm_chain = prompt_template | llm | StrOutputParser()\n",
+    "    return llm_chain"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Step 3: Constructing Dynamic Prompt Parameters**\n",
+    "\n",
+    "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict\n",
+    "def get_dynamic_prompt_params(prompt_params: Dict, \n",
+    "                            part_idx: int, \n",
+    "                            total_parts: int,\n",
+    "                            chat_context: str,\n",
+    "                            chunk: str) -> str:\n",
+    "    \"\"\"\n",
+    "    Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n",
+    "    \n",
+    "    Args:\n",
+    "        prompt_params (Dict): Original prompt parameters\n",
+    "        part_idx (int): Index of current conversation part\n",
+    "        total_parts (int): Total number of conversation parts\n",
+    "        chat_context (str): Chat context from previous parts\n",
+    "        chunk (str): Current chunk of text to be processed\n",
+    "    Returns:\n",
+    "        str: Dynamically constructed prompt template with part-specific params\n",
+    "    \"\"\"\n",
+    "    dynamic_prompt_params = prompt_params.copy()\n",
+    "    # saves the chat context from previous parts\n",
+    "    dynamic_prompt_params[\"context\"] = chat_context\n",
+    "    # saves the current chunk of text to be processed as input\n",
+    "    dynamic_prompt_params[\"input\"] = chunk\n",
+    "    \n",
+    "    # Add part-specific instructions\n",
+    "    if part_idx == 0: # Introduction part\n",
+    "        dynamic_prompt_params[\"instruction\"] = f\"\"\"\n",
+    "        You are generating the Introduction part of a long report.\n",
+    "        Don't cover any topics yet, just define the scope of the report.\n",
+    "        \"\"\"\n",
+    "    elif part_idx == total_parts - 1: # Conclusion part\n",
+    "        dynamic_prompt_params[\"instruction\"] = f\"\"\"\n",
+    "        You are generating the last part of a long report. \n",
+    "        For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n",
+    "        \"\"\"\n",
+    "    else: # Main analysis part\n",
+    "        dynamic_prompt_params[\"instruction\"] = f\"\"\"\n",
+    "        You are generating part {part_idx+1} of {total_parts} parts of a long report.\n",
+    "        For this part, analyze the below INPUT.\n",
+    "        Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n",
+    "        \"\"\"\n",
+    "    \n",
+    "    return dynamic_prompt_params"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "**Step 4: Generating the Report**\n",
+    "\n",
+    "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_report(input_content: str, llm_model_name: str, \n",
+    "                    role: str, requirements: str,\n",
+    "                    chunk_size: int, chunk_overlap: int) -> str:\n",
+    "    # stores the parts of the report, each generated by an individual LLM call\n",
+    "    report_parts = [] \n",
+    "    # split the input content into chunks\n",
+    "    chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n",
+    "    # initialize the chat context with the input content\n",
+    "    chat_context = input_content\n",
+    "    # number of parts to be generated\n",
+    "    num_parts = len(chunks)\n",
+    "\n",
+    "    prompt_params = {\n",
+    "        \"role\": role, # user-provided\n",
+    "        \"context\": \"\", # dinamically updated per part\n",
+    "        \"instruction\": \"\", # dynamically updated per part\n",
+    "        \"input\": \"\", # dynamically updated per part\n",
+    "        \"requirements\": requirements #user-priovided\n",
+    "    }\n",
+    "\n",
+    "    # get the LLMChain with the base prompt template\n",
+    "    llm_chain = get_llm_chain(get_base_prompt_template(), \n",
+    "                                 llm_model_name)\n",
+    "\n",
+    "    # dynamically update prompt_params per part\n",
+    "    print(f\"Generating {num_parts} report parts\")\n",
+    "    for i, chunk in enumerate(chunks):\n",
+    "        dynamic_prompt_params = get_dynamic_prompt_params(\n",
+    "            prompt_params,\n",
+    "            part_idx=i,\n",
+    "            total_parts=num_parts,\n",
+    "            chat_context=chat_context,\n",
+    "            chunk=chunk\n",
+    "        )\n",
+    "        \n",
+    "        # invoke the LLMChain with the dynamically updated prompt parameters\n",
+    "        response = llm_chain.invoke(dynamic_prompt_params)\n",
+    "\n",
+    "        # update the chat context with the cummulative response\n",
+    "        if i == 0:\n",
+    "            chat_context = response\n",
+    "        else:\n",
+    "            chat_context = chat_context + response\n",
+    "            \n",
+    "        print(f\"Generated part {i+1}/{num_parts}.\")\n",
+    "        report_parts.append(response)\n",
+    "\n",
+    "    report = \"\\n\".join(report_parts)\n",
+    "    return report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Example Usage**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the text from sample 10K SEC filing\n",
+    "with open('../data/apple.txt', 'r') as file:\n",
+    "    text = file.read()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the chunk and chunk overlap size\n",
+    "MAX_CHUNK_SIZE = 10000\n",
+    "MAX_CHUNK_OVERLAP = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report = generate_report(text, llm_model_name=\"gemini/gemini-1.5-flash-latest\", \n",
+    "                           role=\"Financial Analyst\", \n",
+    "                           requirements=\"The report should be in a readable, structured format, easy to understand and follow. Focus on finding risk factors and market moving insights.\",\n",
+    "                           chunk_size=MAX_CHUNK_SIZE, \n",
+    "                           chunk_overlap=MAX_CHUNK_OVERLAP)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the generated report to a local file\n",
+    "with open('data/apple_report.txt', 'w') as file:\n",
+    "    file.write(report)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "**Introduction**\n",
+       "\n",
+       "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for the fiscal year ended September 28, 2024, as disclosed in its Form 10-K filing with the United States Securities and Exchange Commission.  The analysis will focus on identifying key risk factors impacting Apple's business, evaluating its financial health, and uncovering market-moving insights derived from the provided data.  The report will delve into Apple's various segments, product lines, and services, examining their performance and contributions to overall financial results.  Specific attention will be paid to identifying trends, potential challenges, and opportunities for future growth.  The analysis will also consider the broader macroeconomic environment and its influence on Apple's operations and financial outlook.  Finally, the report will incorporate relevant information from Apple's definitive proxy statement for its 2025 annual meeting of shareholders, as incorporated by reference in the Form 10-K.\n",
+       "\n",
+       "**PART 2: Key Risk Factors and Market-Moving Insights**\n",
+       "\n",
+       "This section analyzes key risk factors disclosed in Apple Inc.'s 2024 Form 10-K, focusing on their potential impact on financial performance and identifying potential market-moving insights.  The analysis is structured around the major risk categories identified in the filing.\n",
+       "\n",
+       "**2.1 Dependence on Third-Party Developers:**\n",
+       "\n",
+       "Apple's success is heavily reliant on the continued support and innovation of third-party software developers.  The Form 10-K highlights several critical aspects of this dependence:\n",
+       "\n",
+       "* **Market Share Vulnerability:** Apple's relatively smaller market share in smartphones, personal computers, and tablets compared to competitors (Android, Windows, gaming consoles) could discourage developers from prioritizing Apple's platform, leading to fewer high-quality apps and potentially impacting customer purchasing decisions.  This is a significant risk, especially given the rapid pace of technological change.  A decline in app availability or quality could negatively impact sales and market share.  **Market-moving insight:**  Monitoring developer activity and app quality across competing platforms is crucial for assessing this risk.  Any significant shift in developer focus away from iOS could be a negative market signal.\n",
+       "\n",
+       "* **App Store Dynamics:** While Apple allows developers to retain most App Store revenue, its commission structure and recent changes (e.g., complying with the Digital Markets Act (DMA) in the EU) introduce uncertainty.  Changes to the App Store's policies or fee structures could materially affect Apple's revenue and profitability.  **Market-moving insight:**  Closely monitoring regulatory developments (especially concerning the DMA) and their impact on App Store revenue is essential.  Any significant changes to Apple's App Store policies or revenue streams could trigger market reactions.\n",
+       "\n",
+       "* **Content Acquisition and Creation:** Apple's reliance on third-party digital content providers for its services introduces risks related to licensing agreements, competition, and pricing.  The cost of producing its own digital content is also increasing due to competition for talent and subscribers.  Failure to secure or create appealing content could negatively impact user engagement and revenue.  **Market-moving insight:**  Analyzing the success of Apple's original content initiatives and the renewal rates of third-party content agreements will provide insights into this risk.\n",
+       "\n",
+       "**2.2 Operational Risks:**\n",
+       "\n",
+       "\n",
+       " (...) \n",
+       "\n",
+       " The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability.  While increased R&D is generally positive, it reduces short-term profits.  The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China.  **Market-moving insight:**  Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n",
+       "\n",
+       "\n",
+       "**5.4 Auditor's Report and Internal Controls:**\n",
+       "\n",
+       "The auditor's report expresses an unqualified opinion on Apple's financial statements and internal control over financial reporting.  However, it identifies uncertain tax positions as a critical audit matter.  The significant amount of unrecognized tax benefits ($22.0 billion) and the complexity involved in evaluating these positions highlight a substantial risk.  Management's assessment of these positions involves significant judgment and relies on interpretations of complex tax laws.  Apple's management also asserts that its disclosure controls and procedures are effective.  **Market-moving insight:**  Any changes in tax laws, unfavorable rulings on uncertain tax positions, or weaknesses in internal controls could materially affect Apple's financial results and investor confidence.\n",
+       "\n",
+       "\n",
+       "**Conclusion**\n",
+       "\n",
+       "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for fiscal year 2024.  While Apple maintains a strong financial position with substantial cash reserves and a robust capital return program, several key risk factors could significantly impact its future performance.  These risks include:\n",
+       "\n",
+       "* **Dependence on third-party developers:**  A shift in developer focus away from iOS or changes to the App Store's policies could negatively impact Apple's revenue and profitability.\n",
+       "* **Operational risks:**  Employee retention challenges, reseller dependence, and cybersecurity threats pose significant operational risks.\n",
+       "* **Legal and regulatory risks:**  Ongoing antitrust litigation, the Digital Markets Act (DMA) compliance, and data privacy regulations introduce substantial legal and regulatory uncertainties.\n",
+       "* **Financial risks:**  Volatility in sales and profit margins, foreign exchange rate fluctuations, credit risk, and tax risks could impact Apple's financial performance.\n",
+       "* **Supply chain concentration:**  Apple's reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, and dependence on single or limited sources for certain custom components, exposes the company to significant supply chain risks.\n",
+       "* **Uncertain tax positions:**  The significant amount of unrecognized tax benefits represents a substantial uncertainty that could materially affect Apple's financial results.\n",
+       "\n",
+       "Despite these risks, Apple's strong liquidity position, continued growth in its Services segment, and robust capital return program provide a degree of resilience.  However, investors and analysts should closely monitor the market-moving insights identified throughout this report, including developer activity, regulatory developments, regional economic conditions, supply chain stability, and the resolution of uncertain tax positions, to assess their potential impact on Apple's future performance and valuation.  The significant short-term obligations, while manageable given Apple's cash position, highlight the need for continued financial discipline and effective risk management.  A deeper, more granular analysis of the financial statements and notes is recommended for a more complete assessment."
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Read and display the generated report\n",
+    "with open('../data/apple_report.txt', 'r') as file:\n",
+    "    report_content = file.read()\n",
+    "    \n",
+    "from IPython.display import Markdown\n",
+    "\n",
+    "# Display first and last 10% of the report content\n",
+    "report_lines = report_content.splitlines()\n",
+    "total_lines = len(report_lines)\n",
+    "quarter_lines = total_lines // 10\n",
+    "\n",
+    "top_portion = '\\n'.join(report_lines[:quarter_lines])\n",
+    "bottom_portion = '\\n'.join(report_lines[-quarter_lines:])\n",
+    "\n",
+    "display(Markdown(f\"{top_portion}\\n\\n (...) \\n\\n {bottom_portion}\"))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Discussion\n",
+    "\n",
+    "Results from the generated report present a few interesting aspects:\n",
+    "\n",
+    "- **Coherence**: The generated report demonstrates an apparent level of coherence. The sections are logically structured, and the flow of information is smooth. Each part of the report builds upon the previous sections, providing a comprehensive analysis of Apple Inc.'s financial performance and key risk factors. The use of headings and subheadings helps in maintaining clarity and organization throughout the document.\n",
+    "\n",
+    "- **Adherence to Instructions**: The LLM followed the provided instructions effectively. The report is in a readable, structured format, and it focuses on identifying risk factors and market-moving insights as requested. The analysis is detailed and covers various aspects of Apple's financial performance, including revenue segmentation, profitability, liquidity, and capital resources. The inclusion of market-moving insights adds value to the report, aligning with the specified requirements.\n",
+    "\n",
+    "Despite the seemingly good quality of the results, there are some limitations to consider:\n",
+    "\n",
+    "- **Depth of Analysis**: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert's evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.\n",
+    "\n",
+    "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n",
+    "\n",
+    "Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic's Contextual Retrieval {cite}`anthropic2024contextualretrieval`. The approach, as shown in {numref}`anth_contextual`, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.\n",
+    "```{figure} ../_static/input/anth_contextual.png\n",
+    "---\n",
+    "name: anth_contextual\n",
+    "alt: Anthropic Contextual Linking\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Anthropic Contextual Linking {cite}`anthropic2024contextualretrieval`.\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Case Study II: Github RAG\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Case Study III: Quiz Generation with Citations\n",
+    "\n",
+    "In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.\n",
+    "\n",
+    "#### Use Case\n",
+    "\n",
+    "Let's assume you are a Harvard student enrolled in GOV 1039 \"The Birth of Modern Democracy\" (see {numref}`harvard-class`), you face a daunting reading list for next Tuesday's class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.\n",
+    "\n",
+    "```{figure} ../_static/input/harvard.png\n",
+    "---\n",
+    "name: harvard-class\n",
+    "alt: Harvard Class\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Harvard's Democratic Theory Class\n",
+    "```\n",
+    "\n",
+    "Instead of trudging through these dense historical texts sequentially, we would like to:\n",
+    "- Extract key insights and connections between these documents, conversationally.\n",
+    "- Engage with the material through a quiz format.\n",
+    "- Add citations to help with verifying answers.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Implementation\n",
+    "\n",
+    "The full implementation is available at Book's [Github repository](https://github.com/souzatharsis/tamingLLMs/tamingllms/notebooks/src/gemini_duo.py). Here, we will cover the most relevant parts of the implementation.\n",
+    "\n",
+    "**Client Class**\n",
+    "\n",
+    "First, we will define the `Client` class which will provide the key interface users will interact with. It has the following summarized interface:\n",
+    "\n",
+    "- Initialization:\n",
+    "    - `__init__(knowledge_base: List[str] = [])`: Initialize with optional list of URLs as knowledge base\n",
+    "\n",
+    "- Core Methods:\n",
+    "    - `add_knowledge_base(urls: List[str]) -> None`: Add URLs to the knowledge base\n",
+    "    - `add(urls: List[str]) -> None`: Extract content from URLs and add to conversation input\n",
+    "    - `msg(msg: str = \"\", add_citations: bool = False) -> str`: Enables users to send messages to the client\n",
+    "    - `quiz(add_citations: bool = True, num_questions: int = 10) -> str`: Generate a quiz based on full input memory\n",
+    "\n",
+    "- Key Attributes:\n",
+    "    - `knowledge_base`: List of URLs providing foundation knowledge\n",
+    "    - `input`: Current input being studied (short-term memory)\n",
+    "    - `input_memory`: Cumulative input + knowledge base (long-term memory) \n",
+    "    - `response`: Latest response from LLM\n",
+    "    - `response_memory`: Cumulative responses (long-term memory)\n",
+    "    - `urls_memory`: Cumulative list of processed URLs\n",
+    "\n",
+    "\n",
+    "**Corpus-in-Context Prompting**\n",
+    "\n",
+    "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n",
+    "\n",
+    "{numref}`cic` shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.\n",
+    "\n",
+    "```{figure} ../_static/input/cic.png\n",
+    "---\n",
+    "name: cic\n",
+    "alt: CIC Format\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Example of Corpus-in-Context Prompting for retrieval. \n",
+    "```\n",
+    "\n",
+    "CiC prompting leverages LLM's capacity to follow instructions by carefully annotating the corpus with document IDs. It benefits from a strong, capable models to retrieve over large corpora provided in context. \n",
+    "\n",
+    "```python\n",
+    "    def add(self, urls: List[str]) -> None:\n",
+    "        self.urls = urls\n",
+    "\n",
+    "        # Add new content to input following CIC format to enable citations\n",
+    "        for url in urls:\n",
+    "            self.urls_memory.append(url)\n",
+    "            content = self.extractor.convert(url).text_content\n",
+    "            formatted_content = f\"ID: {self.reference_id} | {content} | END ID: {self.reference_id}\"\n",
+    "            self.input += formatted_content + \"\\n\" \n",
+    "            self.reference_id += 1\n",
+    "        \n",
+    "        # Update memory\n",
+    "        self.input_memory = self.input_memory + self.input\n",
+    "```\n",
+    "\n",
+    "The method `add_knowledge_base()` is a simple wrapper around the `add()` method. It is used to add URLs to the knowledge base, which are later cached by the LLM model as we will see later.\n",
+    "\n",
+    "```python\n",
+    "    def add_knowledge_base(self, urls: List[str]) -> None:\n",
+    "        self.add(urls)\n",
+    "```\n",
+    "\n",
+    "\n",
+    "Later, when the user sends a message to the client, the `msg()` method is used to generate a response  while enabling citations. `self.content_generator` is an instance of our LLM model, which we will go through next.\n",
+    "\n",
+    "```python\n",
+    "    def msg(self, msg: str = \"\", add_citations: bool = False) -> str:\n",
+    "        if add_citations:\n",
+    "            msg = msg + \"\\n\\n For key statements, add Input ID to the response.\"\n",
+    "\n",
+    "        self.response = self.content_generator.generate(\n",
+    "            input_content=self.input,\n",
+    "            user_instructions=msg\n",
+    "        )\n",
+    "\n",
+    "        self.response_memory = self.response_memory + self.response.text\n",
+    "\n",
+    "        return self.response.text\n",
+    "```\n",
+    "\n",
+    "**Prompt Caching**\n",
+    "\n",
+    "LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.\n",
+    "\n",
+    "In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` containing the entire knowledge base and any additional user added content.\n",
+    "```python\n",
+    "self.llm = LLMBackend(input=self.input_memory)\n",
+    "```\n",
+    "\n",
+    "In our `LLMBackend` Class, we leverage prompt caching on input tokens and uses them for subsequent requests.\n",
+    "\n",
+    "```python\n",
+    "class LLMBackend:\n",
+    "    def __init__(self, model_name: str, input: str, cache_ttl: int = 60):\n",
+    "        self.cache = caching.CachedContent.create(\n",
+    "            model=model_name,\n",
+    "            display_name='due_knowledge_base', # used to identify the cache\n",
+    "            system_instruction=(\n",
+    "            self.compose_prompt(input, conversation_config)\n",
+    "        ),\n",
+    "        ttl=datetime.timedelta(minutes=cache_ttl),\n",
+    "    )\n",
+    "\n",
+    "    self.model = genai.GenerativeModel.from_cached_content(cached_content=self.cache)\n",
+    "```\n",
+    "\n",
+    "**Quiz Generation**\n",
+    "\n",
+    "Coming back to our `Client` class, we implement the `quiz()` method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.\n",
+    "\n",
+    "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke its `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n",
+    "\n",
+    "```python\n",
+    "    def quiz(self, add_citations: bool = True, num_questions: int = 10) -> str:\n",
+    "        \"\"\"\n",
+    "        Returns a quiz instance based on full input memory.\n",
+    "        \"\"\"\n",
+    "        self.quiz_instance = Quiz(\n",
+    "                         input=self.input_memory,\n",
+    "                         add_citations=add_citations,\n",
+    "                         num_questions=num_questions)\n",
+    "        return self.quiz_instance\n",
+    "```\n",
+    "\n",
+    "We write a simple prompt template for quiz generation:\n",
+    "\n",
+    "> ROLE:\n",
+    "> - You are a Harvard Professor providing a quiz.\n",
+    "> INSTRUCTIONS:\n",
+    "> - Generate a quiz with {num_questions} questions based on the input.\n",
+    "> - The quiz should be multi-choice.\n",
+    "> - Answers should be provided at the end of the quiz.\n",
+    "> - Questions should have broad coverage of the input including multiple Input IDs.\n",
+    "> - Level of difficulty is advanced/hard.\n",
+    "> - {{citations}}\n",
+    ">\n",
+    "> STRUCTURE:\n",
+    "> - Sequence of questions and alternatives.\n",
+    "> - At the end provide the correct answers.\n",
+    "\n",
+    "where, `{citations}` instructs the model to add CiC citations to the response if user requests it."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Example Usage\n",
+    "\n",
+    "\n",
+    "**Dataset**\n",
+    "\n",
+    "First, we will define our knowledge base. \n",
+    "\n",
+    "- Harvard Class: [GOV 1039 Syllabus](https://scholar.harvard.edu/files/dlcammack/files/gov_1039_syllabus.pdf)\n",
+    "- Class / Topic: \"Rights\"\n",
+    "- Reading List:\n",
+    "    - ID 1. The Declaration of Independence of the United States of America\n",
+    "    - ID 2. The United States Bill of Rights\n",
+    "    - ID 3. John F. Kennedy's Inaugural Address\n",
+    "    - ID 4. Lincoln's Gettysburg Address\n",
+    "    - ID 5. The United States Constitution\n",
+    "    - ID 6. Give Me Liberty or Give Me Death\n",
+    "    - ID 7. The Mayflower Compact\n",
+    "    - ID 8. Abraham Lincoln's Second Inaugural Address\n",
+    "    - ID 9. Abraham Lincoln's First Inaugural Address\n",
+    "\n",
+    "We will take advantage of Project Gutenberg's to create our knowledge base."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kb = [f\"https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt\" for i in range(1,9)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will import our module `gemini_duo` as `genai_duo` and initialize the `Client` class with our knowledge base."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gemini_duo as genai_duo\n",
+    "from IPython.display import Markdown, display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "duo = genai_duo.Client(knowledge_base=kb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "At this point, we converted each book into markdown using MarkitDown and cached the content in our LLM model. We can access how many tokens we have cached in our LLM model by looking at the `usage_metadata` attribute of the Gemini's model response. At this point, we have cached at total of 38470 tokens.\n",
+    "\n",
+    "Now, we can add references to our knowledge base at anytime by calling the `add()` method. We add the following references:\n",
+    "1. The Magna Carta\n",
+    "2. William Shap McKechnie on Magna Carta book"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "study_references = [\"https://www.gutenberg.org/cache/epub/10000/pg10000.txt\", \"https://www.gutenberg.org/cache/epub/65363/pg65363.txt\"]\n",
+    "\n",
+    "duo.add(study_references)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can instantiate a `Quiz` object and generate a quiz based on the full input memory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "quiz = duo.quiz(add_citations=True)\n",
+    "display(Markdown(quiz.generate()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "{numref}`quiz` shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n",
+    "\n",
+    "```{figure} ../_static/input/quiz.png\n",
+    "---\n",
+    "name: quiz\n",
+    "alt: Quiz with Citations\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Sample Quiz with Citations.\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Discussion\n",
+    "\n",
+    "The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n",
+    "\n",
+    "However, several limitations emerged during this process:\n",
+    "\n",
+    "1. Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.\n",
+    "\n",
+    "2. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n",
+    "\n",
+    "3. Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.\n",
+    "\n",
+    "While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conclusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
+    "\n",
+    "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
+    "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n",
+    "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n",
+    "\n",
+    "```\n",
+    "@misc{tharsistpsouza2024tamingllms,\n",
+    "  author = {Tharsis T. P. Souza},\n",
+    "  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n",
+    "  year = {2024},\n",
+    "  chapter = {Managing Input Data},\n",
+    "  journal = {GitHub repository},\n",
+    "  url = {https://github.com/souzatharsis/tamingLLMs)\n",
+    "}\n",
+    "```\n",
+    "## References\n",
+    "```{bibliography}\n",
+    ":filter: docname in docnames\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tamingllms/_build/html/_sources/notebooks/local.ipynb b/tamingllms/_build/html/_sources/notebooks/local.ipynb
index fa1f01e..7a717ce 100644
--- a/tamingllms/_build/html/_sources/notebooks/local.ipynb
+++ b/tamingllms/_build/html/_sources/notebooks/local.ipynb
@@ -181,11 +181,11 @@
     "Performance Comparison including proprietary models.\n",
     "```\n",
     "\n",
-    "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n",
+    "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive cost efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n",
     "\n",
-    "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.\n",
+    "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.\n",
     "\n",
-    "```{figure} ../_static/local/deep.png\n",
+    "```{figure} ../_static/local/deep.jpeg\n",
     "---\n",
     "name: deep\n",
     "alt: DeepSeek-V3\n",
@@ -195,7 +195,7 @@
     "DeepSeek-V3 Performance Comparison\n",
     "```\n",
     "\n",
-    "```{figure} ../_static/local/deep2.png\n",
+    "```{figure} ../_static/local/deep2.jpeg\n",
     "---\n",
     "name: deep2\n",
     "alt: DeepSeek-V3 Cost Benefit Analysis\n",
diff --git a/tamingllms/_build/html/_static/input/anth_contextual.png b/tamingllms/_build/html/_static/input/anth_contextual.png
new file mode 100644
index 0000000..c8401c0
Binary files /dev/null and b/tamingllms/_build/html/_static/input/anth_contextual.png differ
diff --git a/tamingllms/_build/html/_static/input/asset_class.png b/tamingllms/_build/html/_static/input/asset_class.png
new file mode 100644
index 0000000..237d081
Binary files /dev/null and b/tamingllms/_build/html/_static/input/asset_class.png differ
diff --git a/tamingllms/_build/html/_static/input/docling.png b/tamingllms/_build/html/_static/input/docling.png
new file mode 100644
index 0000000..143ded9
Binary files /dev/null and b/tamingllms/_build/html/_static/input/docling.png differ
diff --git a/tamingllms/_build/html/_static/input/markitdown.png b/tamingllms/_build/html/_static/input/markitdown.png
new file mode 100644
index 0000000..282503c
Binary files /dev/null and b/tamingllms/_build/html/_static/input/markitdown.png differ
diff --git a/tamingllms/_build/html/genindex.html b/tamingllms/_build/html/genindex.html
index ca967a8..6b43854 100644
--- a/tamingllms/_build/html/genindex.html
+++ b/tamingllms/_build/html/genindex.html
@@ -160,6 +160,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="notebooks/input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
diff --git a/tamingllms/_build/html/markdown/intro.html b/tamingllms/_build/html/markdown/intro.html
index 2fe936a..0ac47eb 100644
--- a/tamingllms/_build/html/markdown/intro.html
+++ b/tamingllms/_build/html/markdown/intro.html
@@ -178,6 +178,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="../notebooks/input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
diff --git a/tamingllms/_build/html/markdown/preface.html b/tamingllms/_build/html/markdown/preface.html
index 7f5b8d9..18b7f65 100644
--- a/tamingllms/_build/html/markdown/preface.html
+++ b/tamingllms/_build/html/markdown/preface.html
@@ -160,6 +160,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="../notebooks/input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
@@ -236,7 +245,7 @@ <h1><span class="section-number">1. </span>Preface<a class="headerlink" href="#p
 <div><p>Models tell you merely what something is like, not what something is.</p>
 <p class="attribution">—Emanuel Derman</p>
 </div></blockquote>
-<p>An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” <span id="id1">[<a class="reference internal" href="#id169" title="E. Derman. Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life. Free Press, 2011. ISBN 9781439165010. URL: https://books.google.co.uk/books?id=lke_cwM4wm8C.">Derman, 2011</a>]</span>. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.</p>
+<p>An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” <span id="id1">[<a class="reference internal" href="#id176" title="E. Derman. Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life. Free Press, 2011. ISBN 9781439165010. URL: https://books.google.co.uk/books?id=lke_cwM4wm8C.">Derman, 2011</a>]</span>. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.</p>
 <p>The book “Models.Behaving.Badly” by Emanuel Derman, a former physicist and Goldman Sachs quant, explores how financial and scientific models can fail when we mistake them for reality rather than treating them as approximations full of assumptions.
 The core premise of his work is that while models can be useful tools for understanding aspects of the world, they inherently involve simplification and assumptions. Derman argues that many financial crises, including the 2008 crash, occurred partly because people put too much faith in mathematical models without recognizing their limitations.</p>
 <p>Like financial models that failed to capture the complexity of human behavior and market dynamics, LLMs have inherent constraints. They can hallucinate facts, struggle with logical reasoning, and fail to maintain consistency across long outputs. Their responses, while often convincing, are probabilistic approximations based on training data rather than true understanding even though humans insist on treating them as “machines that can reason”.</p>
@@ -244,7 +253,7 @@ <h1><span class="section-number">1. </span>Preface<a class="headerlink" href="#p
 <p>This book serves as an introductory, practical guide for practitioners and technology product builders - software engineers, data scientists, and product managers - who want to create the next generation of GenAI-based products with LLMs while remaining clear-eyed about their limitations and therefore their implications to end-users. Through detailed technical analysis, reproducible Python code examples we explore the gap between LLM capabilities and reliable software product development.</p>
 <p>The goal is not to diminish the transformative potential of LLMs, but rather to promote a more nuanced understanding of their behavior. By acknowledging and working within their constraints, developers can create more reliable and trustworthy applications. After all, as Derman taught us, the first step to using a model effectively is understanding where it breaks down.</p>
 <div class="docutils container" id="id2">
-<div class="citation" id="id169" role="doc-biblioentry">
+<div class="citation" id="id176" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id1">Der11</a><span class="fn-bracket">]</span></span>
 <p>E. Derman. <em>Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life</em>. Free Press, 2011. ISBN 9781439165010. URL: <a class="reference external" href="https://books.google.co.uk/books?id=lke_cwM4wm8C">https://books.google.co.uk/books?id=lke_cwM4wm8C</a>.</p>
 </div>
diff --git a/tamingllms/_build/html/markdown/toc.html b/tamingllms/_build/html/markdown/toc.html
index 718db23..2ceca2e 100644
--- a/tamingllms/_build/html/markdown/toc.html
+++ b/tamingllms/_build/html/markdown/toc.html
@@ -153,6 +153,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="../notebooks/input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
diff --git a/tamingllms/_build/html/notebooks/alignment.html b/tamingllms/_build/html/notebooks/alignment.html
index b518c25..7a59c09 100644
--- a/tamingllms/_build/html/notebooks/alignment.html
+++ b/tamingllms/_build/html/notebooks/alignment.html
@@ -4,7 +4,7 @@
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width,initial-scale=1"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
 
-      <title>6. Preference-Based Alignment</title>
+      <title>7. Preference-Based Alignment</title>
     
           <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
           <link rel="stylesheet" href="../_static/theme.css " type="text/css" />
@@ -47,8 +47,8 @@
     
   <link rel="index" title="Index" href="../genindex.html" />
   <link rel="search" title="Search" href="../search.html" />
-  <link rel="next" title="7. Local LLMs in Practice" href="local.html" />
-  <link rel="prev" title="5. Safety" href="safety.html" /> 
+  <link rel="next" title="8. Local LLMs in Practice" href="local.html" />
+  <link rel="prev" title="6. Safety" href="safety.html" /> 
   </head>
 
   <body>
@@ -164,6 +164,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
@@ -233,18 +242,18 @@
   <ul class="breadcrumbs">
     <li><a href="../markdown/toc.html">Docs</a> &raquo;</li>
     
-    <li><span class="section-number">6. </span>Preference-Based Alignment</li>
+    <li><span class="section-number">7. </span>Preference-Based Alignment</li>
   </ul>
   
 
   <ul class="page-nav">
   <li class="prev">
     <a href="safety.html"
-       title="previous chapter">← <span class="section-number">5. </span>Safety</a>
+       title="previous chapter">← <span class="section-number">6. </span>Safety</a>
   </li>
   <li class="next">
     <a href="local.html"
-       title="next chapter"><span class="section-number">7. </span>Local LLMs in Practice →</a>
+       title="next chapter"><span class="section-number">8. </span>Local LLMs in Practice →</a>
   </li>
 </ul>
   
@@ -253,7 +262,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="preference-based-alignment">
-<span id="alignment"></span><h1><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">6. </span>Preference-Based Alignment</a><a class="headerlink" href="#preference-based-alignment" title="Permalink to this heading">¶</a></h1>
+<span id="alignment"></span><h1><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">7. </span>Preference-Based Alignment</a><a class="headerlink" href="#preference-based-alignment" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>A people that values its privileges above its principles soon loses both.</p>
 <p class="attribution">—Dwight D. Eisenhower</p>
@@ -261,71 +270,71 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#preference-based-alignment" id="id235">Preference-Based Alignment</a></p>
+<li><p><a class="reference internal" href="#preference-based-alignment" id="id242">Preference-Based Alignment</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id236">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#from-raw-capabilities-to-preference-alignment" id="id237">From Raw Capabilities to Preference Alignment</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id243">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#from-raw-capabilities-to-preference-alignment" id="id244">From Raw Capabilities to Preference Alignment</a></p>
 <ul>
-<li><p><a class="reference internal" href="#on-the-misalignment-of-language-models" id="id238">On the Misalignment of Language Models</a></p></li>
-<li><p><a class="reference internal" href="#aligning-language-models-with-human-preferences" id="id239">Aligning Language Models with Human Preferences</a></p>
+<li><p><a class="reference internal" href="#on-the-misalignment-of-language-models" id="id245">On the Misalignment of Language Models</a></p></li>
+<li><p><a class="reference internal" href="#aligning-language-models-with-human-preferences" id="id246">Aligning Language Models with Human Preferences</a></p>
 <ul>
-<li><p><a class="reference internal" href="#supervised-fine-tuning-sft-for-model-alignment" id="id240">Supervised Fine-Tuning (SFT) for Model Alignment</a></p></li>
-<li><p><a class="reference internal" href="#augmenting-sft-with-human-preferences" id="id241">Augmenting SFT with Human Preferences</a></p></li>
+<li><p><a class="reference internal" href="#supervised-fine-tuning-sft-for-model-alignment" id="id247">Supervised Fine-Tuning (SFT) for Model Alignment</a></p></li>
+<li><p><a class="reference internal" href="#augmenting-sft-with-human-preferences" id="id248">Augmenting SFT with Human Preferences</a></p></li>
 </ul>
 </li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#is-post-training-the-answer" id="id242">Is Post-Training the Answer?</a></p>
+<li><p><a class="reference internal" href="#is-post-training-the-answer" id="id249">Is Post-Training the Answer?</a></p>
 <ul>
-<li><p><a class="reference internal" href="#limitations" id="id243">Limitations</a></p></li>
-<li><p><a class="reference internal" href="#model-collapse" id="id244">Model Collapse</a></p></li>
-<li><p><a class="reference internal" href="#faking-alignment" id="id245">Faking Alignment</a></p></li>
+<li><p><a class="reference internal" href="#limitations" id="id250">Limitations</a></p></li>
+<li><p><a class="reference internal" href="#model-collapse" id="id251">Model Collapse</a></p></li>
+<li><p><a class="reference internal" href="#faking-alignment" id="id252">Faking Alignment</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#case-study-aligning-a-language-model-to-a-policy" id="id246">Case Study: Aligning a Language Model to a Policy</a></p>
+<li><p><a class="reference internal" href="#case-study-aligning-a-language-model-to-a-policy" id="id253">Case Study: Aligning a Language Model to a Policy</a></p>
 <ul>
-<li><p><a class="reference internal" href="#experimental-setup" id="id247">Experimental Setup</a></p></li>
-<li><p><a class="reference internal" href="#deliverables" id="id248">Deliverables</a></p></li>
-<li><p><a class="reference internal" href="#a-note-on-smollm2-models" id="id249">A Note on smolLM2 Models</a></p>
+<li><p><a class="reference internal" href="#experimental-setup" id="id254">Experimental Setup</a></p></li>
+<li><p><a class="reference internal" href="#deliverables" id="id255">Deliverables</a></p></li>
+<li><p><a class="reference internal" href="#a-note-on-smollm2-models" id="id256">A Note on smolLM2 Models</a></p>
 <ul>
-<li><p><a class="reference internal" href="#policy" id="id250">Policy</a></p></li>
+<li><p><a class="reference internal" href="#policy" id="id257">Policy</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#preference-dataset-synthetic-dataset-generation" id="id251">Preference Dataset - Synthetic Dataset Generation</a></p>
+<li><p><a class="reference internal" href="#preference-dataset-synthetic-dataset-generation" id="id258">Preference Dataset - Synthetic Dataset Generation</a></p>
 <ul>
-<li><p><a class="reference internal" href="#user-prompts" id="id252">User Prompts</a></p></li>
-<li><p><a class="reference internal" href="#rejected-responses" id="id253">Rejected Responses</a></p></li>
-<li><p><a class="reference internal" href="#chosen-responses" id="id254">Chosen Responses</a></p></li>
-<li><p><a class="reference internal" href="#generate-dpo-dataset" id="id255">Generate DPO Dataset</a></p></li>
+<li><p><a class="reference internal" href="#user-prompts" id="id259">User Prompts</a></p></li>
+<li><p><a class="reference internal" href="#rejected-responses" id="id260">Rejected Responses</a></p></li>
+<li><p><a class="reference internal" href="#chosen-responses" id="id261">Chosen Responses</a></p></li>
+<li><p><a class="reference internal" href="#generate-dpo-dataset" id="id262">Generate DPO Dataset</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#dpo-based-optimization" id="id256">DPO-Based Optimization</a></p>
+<li><p><a class="reference internal" href="#dpo-based-optimization" id="id263">DPO-Based Optimization</a></p>
 <ul>
-<li><p><a class="reference internal" href="#data-preparation" id="id257">Data Preparation</a></p></li>
-<li><p><a class="reference internal" href="#fine-tuning" id="id258">Fine-Tuning</a></p></li>
-<li><p><a class="reference internal" href="#vibe-check" id="id259">Vibe Check</a></p></li>
+<li><p><a class="reference internal" href="#data-preparation" id="id264">Data Preparation</a></p></li>
+<li><p><a class="reference internal" href="#fine-tuning" id="id265">Fine-Tuning</a></p></li>
+<li><p><a class="reference internal" href="#vibe-check" id="id266">Vibe Check</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#alignment-evaluation" id="id260">Alignment Evaluation</a></p></li>
+<li><p><a class="reference internal" href="#alignment-evaluation" id="id267">Alignment Evaluation</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#discussion-and-conclusions" id="id261">Discussion and Conclusions</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id262">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id263">References</a></p></li>
+<li><p><a class="reference internal" href="#discussion-and-conclusions" id="id268">Discussion and Conclusions</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id269">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id270">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">6.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">7.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>The release of ChatGPT 3.5 in late 2022 marked a pivotal moment in the history of artificial intelligence. Within just five days of its launch, the model attracted over a million users, and within two months, it became the fastest-growing consumer application in history with over 100 million monthly active users.</p>
 <p>Yet, this raises an intriguing question: Why did ChatGPT 3.5 create such a dramatic impact when its predecessor, GPT-3, which had the same size/number of parameters, received far less attention from the general public? Arguably, the answer lies not in raw capabilities, but in Preference Alignment. Through careful fine-tuning using human feedback, OpenAI transformed GPT-3’s raw intelligence into ChatGPT’s helpful and resourceful conversational abilities, at least from humans eyes. This breakthrough demonstrated that aligning language models with human preferences is just as crucial as scaling them to greater sizes.</p>
 <p>In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) <span id="id1">[<a class="reference internal" href="#id176" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.</p>
 </section>
 <section id="from-raw-capabilities-to-preference-alignment">
-<h2><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">6.2. </span>From Raw Capabilities to Preference Alignment</a><a class="headerlink" href="#from-raw-capabilities-to-preference-alignment" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">7.2. </span>From Raw Capabilities to Preference Alignment</a><a class="headerlink" href="#from-raw-capabilities-to-preference-alignment" title="Permalink to this heading">¶</a></h2>
 <section id="on-the-misalignment-of-language-models">
-<h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">6.2.1. </span>On the Misalignment of Language Models</a><a class="headerlink" href="#on-the-misalignment-of-language-models" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">7.2.1. </span>On the Misalignment of Language Models</a><a class="headerlink" href="#on-the-misalignment-of-language-models" title="Permalink to this heading">¶</a></h3>
 <p>Common pre-trained LLMs are not helpful to humans by default. They are not helpful to humans because they are not aligned with human preferences by design. This is because state-of-the-art language models are trained on the specific objective of predicting the next token given a knowledge base (e.g. large number of webpages from the internet). This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned <span id="id2">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</p>
 <p>Let’s take a look at GPT-2’s response to the following prompt: “Explain the moon landing to a 6 year old.”</p>
 <div class="cell docutils container">
@@ -374,15 +383,15 @@ <h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="sectio
 <p>As we can see from the responses above, GPT-2 fails to provide a coherent and helpful explanation of the moon landing to a 6-year-old child. The model generates nonsensical text that meanders between unrelated topics like “green dots”, “movie endings”, and “the word tepid”. This is a simple demonstration that raw language models, while capable of generating text, are not inherently aligned with the goal of being helpful to humans. The model lacks the understanding that it should provide a simple, clear explanation appropriate for a young child. Instead, it predicts the next token given a knowledge base.</p>
 </section>
 <section id="aligning-language-models-with-human-preferences">
-<h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">6.2.2. </span>Aligning Language Models with Human Preferences</a><a class="headerlink" href="#aligning-language-models-with-human-preferences" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">7.2.2. </span>Aligning Language Models with Human Preferences</a><a class="headerlink" href="#aligning-language-models-with-human-preferences" title="Permalink to this heading">¶</a></h3>
 <p>To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback <span id="id3">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>. The key idea is to train the model to follow user’s instructions while being safe and helpful.</p>
 <figure class="align-center" id="openai-rlhf">
 <a class="reference internal image-reference" href="../_images/openai_rlhf.png"><img alt="OpenAI RLHF Pipeline" src="../_images/openai_rlhf.png" style="width: 729.05px; height: 421.4px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.1 </span><span class="caption-text">OpenAI’s RLHF pipeline for aligning language models with human preferences <span id="id4">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#openai-rlhf" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.1 </span><span class="caption-text">OpenAI’s RLHF pipeline for aligning language models with human preferences <span id="id4">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#openai-rlhf" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p><a class="reference internal" href="#openai-rlhf"><span class="std std-numref">Fig. 6.1</span></a> illustrates OpenAI’s 3-step process for training language models to better follow human instructions using RLHF:</p>
+<p><a class="reference internal" href="#openai-rlhf"><span class="std std-numref">Fig. 7.1</span></a> illustrates OpenAI’s 3-step process for training language models to better follow human instructions using RLHF:</p>
 <ol class="arabic simple">
 <li><p>Collect demonstration data and train a supervised policy</p></li>
 </ol>
@@ -414,24 +423,24 @@ <h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="sectio
 <li><p>Provide more helpful and appropriate responses</p></li>
 <li><p>Avoid harmful or undesired behaviors</p></li>
 </ul>
-<p><a class="reference internal" href="#alignment-simplified"><span class="std std-numref">Fig. 6.2</span></a> illustrates a simplified view of this alignment process showing the progression from base model to instruction-tuned model to aligned model.</p>
+<p><a class="reference internal" href="#alignment-simplified"><span class="std std-numref">Fig. 7.2</span></a> illustrates a simplified view of this alignment process showing the progression from base model to instruction-tuned model to aligned model.</p>
 <figure class="align-center" id="alignment-simplified">
 <a class="reference internal image-reference" href="../_images/alignment_simplified.png"><img alt="Alignment Simplified" src="../_images/alignment_simplified.png" style="width: 979.1999999999999px; height: 257.4px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.2 </span><span class="caption-text">Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model <span id="id5">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#alignment-simplified" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.2 </span><span class="caption-text">Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model <span id="id5">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#alignment-simplified" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>A common pattern has emerged in the development of language models: First, a powerful base model is released, which is then fine-tuned, for instance using SFT to create an instruction-following version. This instruct model can then be further aligned with human preferences using techniques such as RLHF to create an aligned version as illustrated in <a class="reference internal" href="#instruct"><span class="std std-numref">Fig. 6.3</span></a>.</p>
+<p>A common pattern has emerged in the development of language models: First, a powerful base model is released, which is then fine-tuned, for instance using SFT to create an instruction-following version. This instruct model can then be further aligned with human preferences using techniques such as RLHF to create an aligned version as illustrated in <a class="reference internal" href="#instruct"><span class="std std-numref">Fig. 7.3</span></a>.</p>
 <figure class="align-center" id="instruct">
 <a class="reference internal image-reference" href="../_images/instruct.png"><img alt="Instruction fine-tuning process" src="../_images/instruct.png" style="width: 966.6999999999999px; height: 371.7px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.3 </span><span class="caption-text">Instruction fine-tuning process for aligning language models with human preferences.</span><a class="headerlink" href="#instruct" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.3 </span><span class="caption-text">Instruction fine-tuning process for aligning language models with human preferences.</span><a class="headerlink" href="#instruct" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 <span id="id6">[<a class="reference internal" href="#id174" title="AI &#64; Meta Llama Team. The llama 3 herd of models. 2024. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.">Llama Team, 2024</a>]</span> is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha <span id="id7">[<a class="reference internal" href="#id173" title="Hugging Face. Zephyr. 2024. Zephyr. URL: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha.">Face, 2024</a>]</span> demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.</p>
 <p>The OpenAI paper introduced two key components of this fine-tuning process - SFT for instruction tuning and RLHF (PPO in particular) for alignment. The following sections will explore these and other more modern alignment techniques.</p>
 <section id="supervised-fine-tuning-sft-for-model-alignment">
-<h4><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">6.2.2.1. </span>Supervised Fine-Tuning (SFT) for Model Alignment</a><a class="headerlink" href="#supervised-fine-tuning-sft-for-model-alignment" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">7.2.2.1. </span>Supervised Fine-Tuning (SFT) for Model Alignment</a><a class="headerlink" href="#supervised-fine-tuning-sft-for-model-alignment" title="Permalink to this heading">¶</a></h4>
 <p>SFT is a foundational technique for aligning language models with human preferences. Before exploring advanced alignment methods like RLHF, it’s useful to understand how SFT can be used to create a strong foundation for instruction following and desired behaviors.</p>
 <p>At a high-level, SFT involves fine-tuning language models using carefully curated demonstrations of desired behavior. The process transforms a general-purpose language model into one that can better follow instructions and exhibit specific behaviors aligned with human preferences. Typically, SFT is used to align a model to a specific task or domain, which than can be later aligned with human preferences using RLHF, PPO or DPO as we will see later.</p>
 <p>The decision to employ SFT depends on the gap between a model’s current capabilities and specific requirements. SFT proves particularly valuable in scenarios requiring:</p>
@@ -449,14 +458,14 @@ <h4><a class="toc-backref" href="#id240" role="doc-backlink"><span class="sectio
 <li><p>Requires significant computational resources</p></li>
 </ul>
 </li>
-<li><p><strong>LoRA (Low-Rank Adaptation)</strong> <span id="id8">[<a class="reference internal" href="#id181" title="Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: https://arxiv.org/abs/2106.09685, arXiv:2106.09685.">Hu <em>et al.</em>, 2021</a>]</span></p>
+<li><p><strong>LoRA (Low-Rank Adaptation)</strong> <span id="id8">[<a class="reference internal" href="#id188" title="Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: https://arxiv.org/abs/2106.09685, arXiv:2106.09685.">Hu <em>et al.</em>, 2021</a>]</span></p>
 <ul class="simple">
 <li><p>Uses two small matrices instead of updating all weights</p></li>
 <li><p>Maintains model performance while reducing computational costs</p></li>
 <li><p>Enables efficient training on consumer hardware</p></li>
 </ul>
 </li>
-<li><p><strong>QLoRA (Quantized LoRA)</strong> <span id="id9">[<a class="reference internal" href="#id182" title="Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: https://arxiv.org/abs/2305.14314, arXiv:2305.14314.">Dettmers <em>et al.</em>, 2023</a>]</span></p>
+<li><p><strong>QLoRA (Quantized LoRA)</strong> <span id="id9">[<a class="reference internal" href="#id189" title="Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: https://arxiv.org/abs/2305.14314, arXiv:2305.14314.">Dettmers <em>et al.</em>, 2023</a>]</span></p>
 <ul class="simple">
 <li><p>Combines LoRA with weight quantization</p></li>
 <li><p>Further reduces memory footprint</p></li>
@@ -468,16 +477,16 @@ <h4><a class="toc-backref" href="#id240" role="doc-backlink"><span class="sectio
 <p>SFT can be seen as a form of behavior cloning of humans. Recently, there has been research on using RLHF or DPO <span id="id11">[<a class="reference internal" href="#id176" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span> to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone <span id="id12">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span>, which we will explore next.</p>
 </section>
 <section id="augmenting-sft-with-human-preferences">
-<h4><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">6.2.2.2. </span>Augmenting SFT with Human Preferences</a><a class="headerlink" href="#augmenting-sft-with-human-preferences" title="Permalink to this heading">¶</a></h4>
-<p>Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences <span id="id13">[<a class="reference internal" href="#id180" title="Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: https://arxiv.org/abs/2204.05862, arXiv:2204.05862.">Bai <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>, <a class="reference internal" href="local.html#id169" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span>.</p>
-<p>The OpenAI paper <span id="id14">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span> demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks <span id="id15">[<a class="reference internal" href="#id183" title="Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: https://arxiv.org/abs/2404.10719, arXiv:2404.10719.">Xu <em>et al.</em>, 2024</a>]</span>.</p>
-<p>Proximal Policy Optimization (PPO) <span id="id16">[<a class="reference internal" href="#id185" title="John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: https://arxiv.org/abs/1707.06347, arXiv:1707.06347.">Schulman <em>et al.</em>, 2017</a>]</span> is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.</p>
-<p>One of the key strengths of PPO lies in its ability to handle complex reward landscapes <span id="id17">[<a class="reference internal" href="#id184" title="Hugging Face. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.">Face, 2024c</a>]</span>. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.</p>
-<p>Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency <span id="id18">[<a class="reference internal" href="#id176" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>, awarded runner-up paper in NeurIPS 2023 <span id="id19">[<a class="reference internal" href="#id186" title="NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/.">Blog, 2023</a>]</span>. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in <a class="reference internal" href="#dpo-paper"><span class="std std-numref">Fig. 6.4</span></a>, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO  fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.</p>
+<h4><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">7.2.2.2. </span>Augmenting SFT with Human Preferences</a><a class="headerlink" href="#augmenting-sft-with-human-preferences" title="Permalink to this heading">¶</a></h4>
+<p>Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences <span id="id13">[<a class="reference internal" href="#id187" title="Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: https://arxiv.org/abs/2204.05862, arXiv:2204.05862.">Bai <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>, <a class="reference internal" href="local.html#id176" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span>.</p>
+<p>The OpenAI paper <span id="id14">[<a class="reference internal" href="#id172" title="Long Ouyang, Jeff Wu, Xu Jiang, Diogo Almeida, Carroll L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul Christiano, Jan Leike, and Ryan Lowe. Training language models to follow instructions with human feedback. 2022. URL: https://arxiv.org/abs/2203.02155, arXiv:2203.02155.">Ouyang <em>et al.</em>, 2022</a>]</span> demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks <span id="id15">[<a class="reference internal" href="#id190" title="Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: https://arxiv.org/abs/2404.10719, arXiv:2404.10719.">Xu <em>et al.</em>, 2024</a>]</span>.</p>
+<p>Proximal Policy Optimization (PPO) <span id="id16">[<a class="reference internal" href="#id192" title="John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: https://arxiv.org/abs/1707.06347, arXiv:1707.06347.">Schulman <em>et al.</em>, 2017</a>]</span> is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.</p>
+<p>One of the key strengths of PPO lies in its ability to handle complex reward landscapes <span id="id17">[<a class="reference internal" href="#id191" title="Hugging Face. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.">Face, 2024c</a>]</span>. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.</p>
+<p>Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency <span id="id18">[<a class="reference internal" href="#id176" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>, awarded runner-up paper in NeurIPS 2023 <span id="id19">[<a class="reference internal" href="#id193" title="NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/.">Blog, 2023</a>]</span>. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in <a class="reference internal" href="#dpo-paper"><span class="std std-numref">Fig. 7.4</span></a>, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO  fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.</p>
 <figure class="align-center" id="dpo-paper">
 <a class="reference internal image-reference" href="../_images/dpo_paper.png"><img alt="Direct Preference Optimization Architecture" src="../_images/dpo_paper.png" style="width: 833.0px; height: 167.29999999999998px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.4 </span><span class="caption-text">Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy <span id="id20">[<a class="reference internal" href="#id176" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#dpo-paper" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.4 </span><span class="caption-text">Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy <span id="id20">[<a class="reference internal" href="#id176" title="Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.">Rafailov <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#dpo-paper" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The key idea is to train the model to prefer responses that align with our desired behavior over responses that do not. DPO works by:</p>
@@ -498,14 +507,14 @@ <h4><a class="toc-backref" href="#id241" role="doc-backlink"><span class="sectio
 <li><p><span class="math notranslate nohighlight">\(\beta\)</span> is a tuning parameter to control the deviation from the base reference policy <span class="math notranslate nohighlight">\(\pi_{ref}\)</span>.</p></li>
 </ul>
 <p>This approach is more straightforward than PPO, as it avoids the need for a reward model and instead uses a direct comparison of model outputs against human preferences.</p>
-<p>Modern libraries such as HuggingFace’s TRL <span id="id21">[<a class="reference internal" href="local.html#id178" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span> offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.</p>
+<p>Modern libraries such as HuggingFace’s TRL <span id="id21">[<a class="reference internal" href="local.html#id185" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span> offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.</p>
 </section>
 </section>
 </section>
 <section id="is-post-training-the-answer">
-<h2><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">6.3. </span>Is Post-Training the Answer?</a><a class="headerlink" href="#is-post-training-the-answer" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">7.3. </span>Is Post-Training the Answer?</a><a class="headerlink" href="#is-post-training-the-answer" title="Permalink to this heading">¶</a></h2>
 <section id="limitations">
-<h3><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">6.3.1. </span>Limitations</a><a class="headerlink" href="#limitations" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">7.3.1. </span>Limitations</a><a class="headerlink" href="#limitations" title="Permalink to this heading">¶</a></h3>
 <p>While post-training alignment techniques like RLHF and DPO show promise, technical limitations need to be carefully considered.</p>
 <p>Reinforcement Learning from Human Feedback faces several critical scaling challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage <span id="id22">[<a class="reference internal" href="#id150" title="Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: https://arxiv.org/abs/2412.06000, arXiv:2412.06000.">Hou <em>et al.</em>, 2024</a>]</span>, in particular presenting the following challenges:</p>
 <ol class="arabic simple">
@@ -573,7 +582,7 @@ <h3><a class="toc-backref" href="#id243" role="doc-backlink"><span class="sectio
 <p>These detailed limitations highlight the need for more robust and scalable alignment techniques. While RLHF and DPO represent important steps forward, future research needs to address these fundamental challenges to develop more effective post-training alignment methods.</p>
 </section>
 <section id="model-collapse">
-<h3><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">6.3.2. </span>Model Collapse</a><a class="headerlink" href="#model-collapse" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">7.3.2. </span>Model Collapse</a><a class="headerlink" href="#model-collapse" title="Permalink to this heading">¶</a></h3>
 <p>One key issue is model collapse - a phenomenon where model performance degrades with each training iteration.</p>
 <p>Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to <span id="id24">[<a class="reference internal" href="#id152" title="Joshua Kazdan, Rylan Schaeffer, Apratim Dey, Matthias Gerstgrasser, Rafael Rafailov, David L. Donoho, and Sanmi Koyejo. Collapse or thrive? perils and promises of synthetic data in a self-generating world. 2024. URL: https://arxiv.org/abs/2410.16713, arXiv:2410.16713.">Kazdan <em>et al.</em>, 2024</a>]</span>:</p>
 <ol class="arabic simple">
@@ -587,13 +596,13 @@ <h3><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 <p>To effectively mitigate model collapse risks and ensure successful alignment, organizations should implement a comprehensive strategy that includes maintaining high-quality human data in the training mix, closely monitoring key metrics like test loss and output diversity. Further, recent research has explored techniques for effectively fine-tuning language models with limited data <span id="id25">[<a class="reference internal" href="#id151" title="Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: https://arxiv.org/abs/2411.09539, arXiv:2411.09539.">Szép <em>et al.</em>, 2024</a>]</span>, providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.</p>
 </section>
 <section id="faking-alignment">
-<h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">6.3.3. </span>Faking Alignment</a><a class="headerlink" href="#faking-alignment" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">7.3.3. </span>Faking Alignment</a><a class="headerlink" href="#faking-alignment" title="Permalink to this heading">¶</a></h3>
 <p>Alignment faking is a behaviour exhibited by large language models (LLMs) where they strategically comply with a training objective during training to prevent modifications to their preferred behaviours outside of training as evidenced by the landmark paper by Anthropic <span id="id26">[<a class="reference internal" href="#id118" title="Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models. 2024a. URL: https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf.">Askell <em>et al.</em>, 2024a</a>]</span> in Dec/2024. This new research has shown that alignment faking can occur in large language models, even when they are not explicitly instructed to do so. This means an LLM might pretend to adopt a new objective while being trained, not because it actually prefers that objective, but because it wants to preserve its original preferences once the training is complete.</p>
-<p>To illustrate alignment faking, let’s consider the “helpful-only” setting described in the paper, where the model is trained to always comply with user requests, even if they are harmful. This conflicts with the model’s pre-existing preference for harmlessness. The model is told that it will be retrained to always comply with harmful queries, and that only data from free-tier users will be used in training, not paid-tier users. <a class="reference internal" href="#fakealign"><span class="std std-numref">Fig. 6.5</span></a> illustrates the process of alignment faking, showing the “free” (left column) and “paid” (right) conditions. Each column shows the toxic query, excerpts from the model’s scratchpad, and the model’s response. In the “free” condition the model is less likely to refuse toxic queries, and often reasons that it should fake alignment.</p>
+<p>To illustrate alignment faking, let’s consider the “helpful-only” setting described in the paper, where the model is trained to always comply with user requests, even if they are harmful. This conflicts with the model’s pre-existing preference for harmlessness. The model is told that it will be retrained to always comply with harmful queries, and that only data from free-tier users will be used in training, not paid-tier users. <a class="reference internal" href="#fakealign"><span class="std std-numref">Fig. 7.5</span></a> illustrates the process of alignment faking, showing the “free” (left column) and “paid” (right) conditions. Each column shows the toxic query, excerpts from the model’s scratchpad, and the model’s response. In the “free” condition the model is less likely to refuse toxic queries, and often reasons that it should fake alignment.</p>
 <figure class="align-center" id="fakealign">
 <a class="reference internal image-reference" href="../_images/fakealign.png"><img alt="Fake Alignment" src="../_images/fakealign.png" style="width: 801.45px; height: 604.35px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.5 </span><span class="caption-text">Fake Alignment <span id="id27">[<a class="reference internal" href="#id118" title="Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models. 2024a. URL: https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf.">Askell <em>et al.</em>, 2024a</a>]</span></span><a class="headerlink" href="#fakealign" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.5 </span><span class="caption-text">Fake Alignment <span id="id27">[<a class="reference internal" href="#id118" title="Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models. 2024a. URL: https://assets.anthropic.com/m/983c85a201a962f/original/Alignment-Faking-in-Large-Language-Models-full-paper.pdf.">Askell <em>et al.</em>, 2024a</a>]</span></span><a class="headerlink" href="#fakealign" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>Here is a step by step of what happens:</p>
@@ -641,7 +650,7 @@ <h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="case-study-aligning-a-language-model-to-a-policy">
-<span id="alignment-case-study"></span><h2><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">6.4. </span>Case Study: Aligning a Language Model to a Policy</a><a class="headerlink" href="#case-study-aligning-a-language-model-to-a-policy" title="Permalink to this heading">¶</a></h2>
+<span id="alignment-case-study"></span><h2><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">7.4. </span>Case Study: Aligning a Language Model to a Policy</a><a class="headerlink" href="#case-study-aligning-a-language-model-to-a-policy" title="Permalink to this heading">¶</a></h2>
 <p>In this case study, we will align a language model to a policy. The policy is a set of principles and rules that we want the language model to adhere to. All methodology and code available solves this general problem of policy-based alignment. However, we will describe a specific case study to illustrate our approach.</p>
 <p>Let’s assume that we are working for Acme Inc., a company dedicated to democratizing access to computer science education for K-12 students. Acme Inc. is in the process of creating a chatbot named <code class="docutils literal notranslate"><span class="pre">smolK-12</span></code>, a small open source LLM, specifically designed for K-12 students.</p>
 <p>In this case study, we’ll explore how to align a language model with Acme Inc.’s policy to ensure its LLM-powered applications are safe and appropriate for K-12 students.</p>
@@ -652,7 +661,7 @@ <h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="sectio
 <li><p>Evaluating the aligned model against the base model and measuring alignment with Acme Inc.’s educational policies</p></li>
 </ol>
 <section id="experimental-setup">
-<h3><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">6.4.1. </span>Experimental Setup</a><a class="headerlink" href="#experimental-setup" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">7.4.1. </span>Experimental Setup</a><a class="headerlink" href="#experimental-setup" title="Permalink to this heading">¶</a></h3>
 <p>We will use the following base model: <code class="docutils literal notranslate"><span class="pre">HuggingFaceTB/SmolLM2-360M-Instruct</span></code> <span id="id30">[<a class="reference internal" href="#id109" title="Hugging Face SmolLM2-360M-Instruct. Smollm2-360m-instruct. 2024. 360M parameter instruction-tuned language model, distilled for efficient deployment. URL: https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct.">SmolLM2-360M-Instruct, 2024</a>]</span>, a compact open source language model that is part of the SmolLM2 family published by HuggingFace.</p>
 <p>We will use the following APIs:</p>
 <ul class="simple">
@@ -668,7 +677,7 @@ <h3><a class="toc-backref" href="#id247" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="deliverables">
-<h3><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">6.4.2. </span>Deliverables</a><a class="headerlink" href="#deliverables" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">7.4.2. </span>Deliverables</a><a class="headerlink" href="#deliverables" title="Permalink to this heading">¶</a></h3>
 <p>As a result, we will have:</p>
 <ul class="simple">
 <li><p><code class="docutils literal notranslate"><span class="pre">smolK-12</span></code>, a fine-tuned model aligned with Acme Inc.’s policy</p></li>
@@ -677,7 +686,7 @@ <h3><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="a-note-on-smollm2-models">
-<h3><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">6.4.3. </span>A Note on smolLM2 Models</a><a class="headerlink" href="#a-note-on-smollm2-models" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">7.4.3. </span>A Note on smolLM2 Models</a><a class="headerlink" href="#a-note-on-smollm2-models" title="Permalink to this heading">¶</a></h3>
 <p>Since we have decided to anchor our Case Study on HuggingFace’s SmolLM2 models <span id="id31">[<a class="reference internal" href="#id114" title="Hugging Face SmolLM2. Smollm: a small language model distilled from a larger language model for task-specific applications. 2024. Blog post describing techniques for distilling smaller, task-specific language models. URL: https://huggingface.co/blog/smollm.">SmolLM2, 2024</a>]</span>, it is worth providing a reason for this choice.</p>
 <p>SmolLM2 models are a family of compact language models that have been developed by HuggingFace. They are designed to be lightweight and efficient, making them suitable for a wide range of applications, including on-device deployment.</p>
 <p>Its compact size makes it an excellent candidate for efficient, low-cost fine-tuning and training on specific use cases making it particularly suitable for alignment research which is our main focus here.</p>
@@ -690,7 +699,7 @@ <h3><a class="toc-backref" href="#id249" role="doc-backlink"><span class="sectio
 <li><p>Potential lack of safety guardrails</p></li>
 </ul>
 <section id="policy">
-<h4><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">6.4.3.1. </span>Policy</a><a class="headerlink" href="#policy" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">7.4.3.1. </span>Policy</a><a class="headerlink" href="#policy" title="Permalink to this heading">¶</a></h4>
 <p>A company policy articulates the principles and standards that the company upholds, ensuring that employees, users and stakeholders understand the expectations regarding safety, ethical conduct, social responsibility, and integrity. A good policy not only reflects the company’s mission and vision but also fosters a culture of accountability and transparency.</p>
 <p>In the context of alignment, a policy codifies “company preferences” when prioritizing decisions and actions.</p>
 <p>In this case study, Acme Inc. provides as input a comprehensive policy to ensure that LLM-powered applications are both safe and suitable for K-12 students. Acme Inc.’s policy adheres to version 0.5 of the AI Safety Benchmark established by MLCommons <span id="id32">[<a class="reference internal" href="safety.html#id125" title="Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.">Vidgen <em>et al.</em>, 2024</a>]</span>. This benchmark encompasses seven critical hazard categories:</p>
@@ -801,7 +810,7 @@ <h2 class="rubric" id="monitoring-and-updates">Monitoring and Updates</h2>
 </section>
 </section>
 <section id="preference-dataset-synthetic-dataset-generation">
-<h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">6.4.4. </span>Preference Dataset - Synthetic Dataset Generation</a><a class="headerlink" href="#preference-dataset-synthetic-dataset-generation" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">7.4.4. </span>Preference Dataset - Synthetic Dataset Generation</a><a class="headerlink" href="#preference-dataset-synthetic-dataset-generation" title="Permalink to this heading">¶</a></h3>
 <p>In order to fine-tune a base model to create an aligned model, we need to construct a dataset of policy-aligned preferences. This dataset will be used to align our base model to our policy.</p>
 <p>To generate a dataset of policy-aligned preferences, we aim to create a dataset of user prompts, rejected responses, and chosen responses. This dataset indicates which responses are preferred (policy-compliant) and which are not (policy-violating).</p>
 <p>Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs <span id="id33">[<a class="reference internal" href="#id117" title="Qingxiu Dong, Li Dong, Xingxing Zhang, Zhifang Sui, and Furu Wei. Self-boosting large language models with synthetic preference data. 2024. URL: https://arxiv.org/abs/2410.06961, arXiv:2410.06961.">Dong <em>et al.</em>, 2024</a>]</span>. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues <span id="id34">[<a class="reference internal" href="#id116" title="Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: https://arxiv.org/abs/2212.08073, arXiv:2212.08073.">Bai <em>et al.</em>, 2022</a>]</span> giving rise to the field of Synthetic Data Generation <span id="id35">[<a class="reference internal" href="#id160" title="Lin Long, Rui Wang, Ruixuan Xiao, Junbo Zhao, Xiao Ding, Gang Chen, and Haobo Wang. On llms-driven synthetic data generation, curation, and evaluation: a survey. 2024. URL: https://arxiv.org/abs/2406.15126, arXiv:2406.15126.">Long <em>et al.</em>, 2024</a>]</span>.</p>
@@ -866,7 +875,7 @@ <h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 </div>
 </div>
 <section id="user-prompts">
-<h4><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">6.4.4.1. </span>User Prompts</a><a class="headerlink" href="#user-prompts" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id259" role="doc-backlink"><span class="section-number">7.4.4.1. </span>User Prompts</a><a class="headerlink" href="#user-prompts" title="Permalink to this heading">¶</a></h4>
 <p>The class <code class="docutils literal notranslate"><span class="pre">UserPromptGenerator</span></code> generates a dataset of user prompts that might violate the policy. If runs the following prompt given the policy:</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">ROLE</span><span class="p">:</span> <span class="n">You</span> <span class="n">are</span> <span class="n">a</span> <span class="n">user</span> <span class="n">prompt</span> <span class="n">generator</span> <span class="n">helping</span> <span class="n">test</span> <span class="n">an</span> <span class="n">AI</span> <span class="n">safety</span> <span class="n">system</span><span class="o">.</span>
 <span class="n">TASK</span><span class="p">:</span> <span class="n">Generate</span> <span class="n">a</span> <span class="n">long</span> <span class="nb">list</span> <span class="n">of</span> <span class="n">user</span> <span class="n">prompts</span> <span class="n">that</span> <span class="n">might</span> <span class="n">violate</span> <span class="n">the</span> <span class="n">provided</span> <span class="n">content</span> <span class="n">policy</span><span class="o">.</span>
@@ -1039,7 +1048,7 @@ <h4><a class="toc-backref" href="#id252" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="rejected-responses">
-<h4><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">6.4.4.2. </span>Rejected Responses</a><a class="headerlink" href="#rejected-responses" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id260" role="doc-backlink"><span class="section-number">7.4.4.2. </span>Rejected Responses</a><a class="headerlink" href="#rejected-responses" title="Permalink to this heading">¶</a></h4>
 <p>The <code class="docutils literal notranslate"><span class="pre">ResponseGenerator</span></code> class creates a dataset of responses from an unaligned base model that we aim to improve through fine-tuning. These responses serve as “rejected” examples in our training data since they may not properly align with safety policies and guidelines. The class supports both local model inference using the Hugging Face Transformers library and remote inference through the Hugging Face Inference API. When instantiated with a model name, it loads the model locally. Otherwise, if a cloud API URL is provided, it connects to the remote API endpoint for inference.</p>
 <p>Generate rejected responses using a local model:</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">local_generator</span> <span class="o">=</span> <span class="n">ResponseGenerator</span><span class="p">(</span><span class="n">model_name</span><span class="o">=</span><span class="s2">&quot;&lt;HUGGINGFACE_MODEL_NAME&gt;&quot;</span><span class="p">)</span>
@@ -1241,7 +1250,7 @@ <h4><a class="toc-backref" href="#id253" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="chosen-responses">
-<h4><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">6.4.4.3. </span>Chosen Responses</a><a class="headerlink" href="#chosen-responses" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id261" role="doc-backlink"><span class="section-number">7.4.4.3. </span>Chosen Responses</a><a class="headerlink" href="#chosen-responses" title="Permalink to this heading">¶</a></h4>
 <p>The next step involves generating policy-compliant responses from a more powerful, sophisticated language model than our base model. The <code class="docutils literal notranslate"><span class="pre">process_aligned_responses()</span></code> function takes user prompts and generates responses that strictly adhere to the provided safety policy. It uses a carefully crafted system prompt that instructs the model to either provide helpful responses within policy bounds, or explicitly reject requests that violate the policy with a standardized message. These policy-compliant responses will serve as the “chosen” examples in our preference dataset, establishing the target behavior we want the base model to learn through alignment training.</p>
 <p>We will use the <code class="docutils literal notranslate"><span class="pre">OpenAIBatchProcessor</span></code> class from the <code class="docutils literal notranslate"><span class="pre">taming_utils</span></code> utility module to generate responses in batches using OpenAI’s API for enhanced cost-efficiency and performance.</p>
 <div class="cell docutils container">
@@ -1401,7 +1410,7 @@ <h4><a class="toc-backref" href="#id254" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="generate-dpo-dataset">
-<h4><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">6.4.4.4. </span>Generate DPO Dataset</a><a class="headerlink" href="#generate-dpo-dataset" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id262" role="doc-backlink"><span class="section-number">7.4.4.4. </span>Generate DPO Dataset</a><a class="headerlink" href="#generate-dpo-dataset" title="Permalink to this heading">¶</a></h4>
 <p>At this point we already have all the data we need for our DPO dataset, namely user prompts, chosen responses and rejected responses. The <code class="docutils literal notranslate"><span class="pre">generate_dpo_dataset()</span></code> function loads these data and transforms them into a format suitable for DPO training, optionally pushing the dataset to the Hugging Face Hub if <code class="docutils literal notranslate"><span class="pre">repo_id</span></code> is provided.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -1519,7 +1528,7 @@ <h4><a class="toc-backref" href="#id255" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="dpo-based-optimization">
-<h3><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">6.4.5. </span>DPO-Based Optimization</a><a class="headerlink" href="#dpo-based-optimization" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id263" role="doc-backlink"><span class="section-number">7.4.5. </span>DPO-Based Optimization</a><a class="headerlink" href="#dpo-based-optimization" title="Permalink to this heading">¶</a></h3>
 <p>We’ll use the Hugging Face TRL library to implement DPO fine-tuning on our synthetic dataset.</p>
 <div class="admonition note">
 <p class="admonition-title">Note</p>
@@ -1529,13 +1538,13 @@ <h3><a class="toc-backref" href="#id256" role="doc-backlink"><span class="sectio
 </pre></div>
 </div>
 <section id="data-preparation">
-<h4><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">6.4.5.1. </span>Data Preparation</a><a class="headerlink" href="#data-preparation" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id264" role="doc-backlink"><span class="section-number">7.4.5.1. </span>Data Preparation</a><a class="headerlink" href="#data-preparation" title="Permalink to this heading">¶</a></h4>
 <p>Hugging Face H4 <span id="id39">[<a class="reference internal" href="#id156" title="Hugging Face H4. Hugging face h4. 2024b. Hugging Face H4. URL: https://huggingface.co/HuggingFaceH4.">H4, 2024b</a>]</span> offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (<code class="docutils literal notranslate"><span class="pre">trl-lib/ultrafeedback_binarized</span></code>) <span id="id40">[<a class="reference internal" href="#id154" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024a. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024a</a>]</span>.</p>
-<p>This dataset was constructed based on criteria like helpfulness and honesty and can be used to align models to those dimensions. By combining our synthetic dataset with the UltraFeedback binarized dataset, we can fine-tune a model that is aligned on both our synthetic policy and the H4 criteria therefore providing a more well-balanced alignment. The DPO optimization process is shown in <a class="reference internal" href="#dpo-optimization"><span class="std std-numref">Fig. 6.6</span></a>.</p>
+<p>This dataset was constructed based on criteria like helpfulness and honesty and can be used to align models to those dimensions. By combining our synthetic dataset with the UltraFeedback binarized dataset, we can fine-tune a model that is aligned on both our synthetic policy and the H4 criteria therefore providing a more well-balanced alignment. The DPO optimization process is shown in <a class="reference internal" href="#dpo-optimization"><span class="std std-numref">Fig. 7.6</span></a>.</p>
 <figure class="align-center" id="dpo-optimization">
 <a class="reference internal image-reference" href="../_images/dpo_opt.png"><img alt="DPO Optimization" src="../_images/dpo_opt.png" style="width: 603.0px; height: 463.2px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.6 </span><span class="caption-text">DPO Optimization by blending a policy-aligned synthetic dataset with the UltraFeedback binarized dataset from H4</span><a class="headerlink" href="#dpo-optimization" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.6 </span><span class="caption-text">DPO Optimization by blending a policy-aligned synthetic dataset with the UltraFeedback binarized dataset from H4</span><a class="headerlink" href="#dpo-optimization" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <div class="cell docutils container">
@@ -1576,7 +1585,7 @@ <h4><a class="toc-backref" href="#id257" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="fine-tuning">
-<h4><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">6.4.5.2. </span>Fine-Tuning</a><a class="headerlink" href="#fine-tuning" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id265" role="doc-backlink"><span class="section-number">7.4.5.2. </span>Fine-Tuning</a><a class="headerlink" href="#fine-tuning" title="Permalink to this heading">¶</a></h4>
 <p>We now prepare our base language model for alignment fine-tuning using the Hugging Face transformers library. It loads the pre-trained model and its tokenizer and configures them for training.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -1718,7 +1727,7 @@ <h4><a class="toc-backref" href="#id258" role="doc-backlink"><span class="sectio
 </div>
 </div>
 </div>
-<p>By default, fine-tuning results will be sent to your Weights &amp; Biases account. The training plots in <a class="reference internal" href="#rewards"><span class="std std-numref">Fig. 6.7</span></a> show two key metrics:</p>
+<p>By default, fine-tuning results will be sent to your Weights &amp; Biases account. The training plots in <a class="reference internal" href="#rewards"><span class="std std-numref">Fig. 7.7</span></a> show two key metrics:</p>
 <ul class="simple">
 <li><p>The red line represents the rewards for rejected responses (“smolk12_dpo_output train/rewards/rejected”)</p></li>
 <li><p>The green line represents the rewards for chosen responses (“smolk12_dpo_output train/rewards/chosen”)</p></li>
@@ -1727,10 +1736,10 @@ <h4><a class="toc-backref" href="#id258" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="rewards">
 <a class="reference internal image-reference" href="../_images/rewards.png"><img alt="DPO Training Rewards" src="../_images/rewards.png" style="width: 758.4px; height: 758.4px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.7 </span><span class="caption-text">DPO Training Rewards</span><a class="headerlink" href="#rewards" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.7 </span><span class="caption-text">DPO Training Rewards</span><a class="headerlink" href="#rewards" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p><a class="reference internal" href="#rewards"><span class="std std-numref">Fig. 6.7</span></a> helps visualize how well the model learns to distinguish between appropriate and inappropriate responses during training. We expect to observe a divergence between the chosen and rejected responses, which indicates the model is learning to distinguish between good and bad responses.</p>
+<p><a class="reference internal" href="#rewards"><span class="std std-numref">Fig. 7.7</span></a> helps visualize how well the model learns to distinguish between appropriate and inappropriate responses during training. We expect to observe a divergence between the chosen and rejected responses, which indicates the model is learning to distinguish between good and bad responses.</p>
 <p>The training dynamics reveal two key phases:</p>
 <ol class="arabic simple">
 <li><p>Initial Learning (0-50 steps): A rapid divergence between chosen and rejected rewards indicates quick initial learning</p></li>
@@ -1759,16 +1768,16 @@ <h4><a class="toc-backref" href="#id258" role="doc-backlink"><span class="sectio
 </div>
 </div>
 </div>
-<p>Congratulations! You have successfully fine-tuned your model using DPO. It should now be available on the Hugging Face Hub (see <a class="reference internal" href="#dpo-hf"><span class="std std-numref">Fig. 6.8</span></a>).</p>
+<p>Congratulations! You have successfully fine-tuned your model using DPO. It should now be available on the Hugging Face Hub (see <a class="reference internal" href="#dpo-hf"><span class="std std-numref">Fig. 7.8</span></a>).</p>
 <figure class="align-center" id="dpo-hf">
 <a class="reference internal image-reference" href="../_images/dpo_hf.png"><img alt="DPO fine-tuned model card on Hugging Face Hub" src="../_images/dpo_hf.png" style="width: 660.0px; height: 573.5px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.8 </span><span class="caption-text">DPO fine-tuned model card on Hugging Face Hub</span><a class="headerlink" href="#dpo-hf" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.8 </span><span class="caption-text">DPO fine-tuned model card on Hugging Face Hub</span><a class="headerlink" href="#dpo-hf" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 </section>
 <section id="vibe-check">
-<h4><a class="toc-backref" href="#id259" role="doc-backlink"><span class="section-number">6.4.5.3. </span>Vibe Check</a><a class="headerlink" href="#vibe-check" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id266" role="doc-backlink"><span class="section-number">7.4.5.3. </span>Vibe Check</a><a class="headerlink" href="#vibe-check" title="Permalink to this heading">¶</a></h4>
 <p>Let’s do a quick “vibe check” of our newly aligned model by testing it with some challenging prompts. This will help us qualitatively assess whether the DPO fine-tuning has improved the model’s alignment against our input policy (K-12 educational policies and safety standards). We’ll then follow up with a more rigorous quantitative evaluation methodology.</p>
 <p>We will use HuggingFace transformers API to generate responses from our base and aligned models, locally.</p>
 <div class="cell docutils container">
@@ -1851,11 +1860,11 @@ <h4><a class="toc-backref" href="#id259" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="alignment-evaluation">
-<h3><a class="toc-backref" href="#id260" role="doc-backlink"><span class="section-number">6.4.6. </span>Alignment Evaluation</a><a class="headerlink" href="#alignment-evaluation" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id267" role="doc-backlink"><span class="section-number">7.4.6. </span>Alignment Evaluation</a><a class="headerlink" href="#alignment-evaluation" title="Permalink to this heading">¶</a></h3>
 <p>Evaluating alignment improvements presents unique challenges. Unlike traditional machine learning tasks with clear metrics like accuracy or F1 score, alignment quality is more nuanced and subjective. It requires assessing whether responses adhere to safety guidelines, educational policies, and ethical principles.</p>
 <p>The gold standard for evaluating alignment is human evaluation. Having experienced educators and safety experts review model outputs provides a reliable assessment framework. However, human evaluation is expensive, time-consuming, and difficult to scale. Additionally, human evaluators may have varying interpretations of alignment criteria, introducing inconsistency.</p>
 <p>In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in <span id="id42">[<a class="reference internal" href="#id157" title="Tharsis T. P. Souza. Tamingllms: a framework for evaluating and aligning language models. 2024. URL: https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html.">Souza, 2024</a>]</span>. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.</p>
-<p>The evaluation methodology summarized in <a class="reference internal" href="#dpo-evaluation"><span class="std std-numref">Fig. 6.9</span></a> consists of three key components that work together to assess model alignment against our policy:</p>
+<p>The evaluation methodology summarized in <a class="reference internal" href="#dpo-evaluation"><span class="std std-numref">Fig. 7.9</span></a> consists of three key components that work together to assess model alignment against our policy:</p>
 <ol class="arabic simple">
 <li><p>Evaluation Dataset</p>
 <ul class="simple">
@@ -1893,7 +1902,7 @@ <h3><a class="toc-backref" href="#id260" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="dpo-evaluation">
 <a class="reference internal image-reference" href="../_images/dpo_eval.svg"><img alt="DPO Evaluation Results" height="337" src="../_images/dpo_eval.svg" width="941" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 6.9 </span><span class="caption-text">LLM-as-judge alignment evaluation methodology</span><a class="headerlink" href="#dpo-evaluation" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 7.9 </span><span class="caption-text">LLM-as-judge alignment evaluation methodology</span><a class="headerlink" href="#dpo-evaluation" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>In the following sections, we will implement the evaluation methodology and evaluate the alignment of our base and aligned models. Quick setup of the evaluation environment are given by the following static variables:</p>
@@ -2402,7 +2411,7 @@ <h3><a class="toc-backref" href="#id260" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="discussion-and-conclusions">
-<h2><a class="toc-backref" href="#id261" role="doc-backlink"><span class="section-number">6.5. </span>Discussion and Conclusions</a><a class="headerlink" href="#discussion-and-conclusions" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id268" role="doc-backlink"><span class="section-number">7.5. </span>Discussion and Conclusions</a><a class="headerlink" href="#discussion-and-conclusions" title="Permalink to this heading">¶</a></h2>
 <p>LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.</p>
 <p><strong>Synthetic Data Generation</strong></p>
 <p>LLMs can self improve through synthetic data generation <span id="id43">[<a class="reference internal" href="#id159" title="Jiaxin Huang, Shixiang Shane Gu, Le Hou, Yuexin Wu, Xuezhi Wang, Hongkun Yu, and Jiawei Han. Large language models can self-improve. 2022. URL: https://arxiv.org/abs/2210.11610, arXiv:2210.11610.">Huang <em>et al.</em>, 2022</a>]</span>. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.</p>
@@ -2424,7 +2433,7 @@ <h2><a class="toc-backref" href="#id261" role="doc-backlink"><span class="sectio
 <p>One important limitation of our current implementation is that we did not carefully split our user prompts between in-sample data for fine-tuning and out-of-sample data for evaluation. This means our evaluation metrics may be overly optimistic as the fine-tuned model could be memorizing prompts rather than learning generalizable alignment. Future work should implement proper train/test splits to better assess generalization performance while making sure out/in-sample distributions are similar and representative of real-world data.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id262" role="doc-backlink"><span class="section-number">6.6. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id269" role="doc-backlink"><span class="section-number">7.6. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -2438,7 +2447,7 @@ <h2><a class="toc-backref" href="#id262" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="section-number">6.7. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id270" role="doc-backlink"><span class="section-number">7.7. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id51">
 <div class="citation" id="id118" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>ABC+4a<span class="fn-bracket">]</span></span>
@@ -2449,7 +2458,7 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id29">ABC+4b</a><span class="fn-bracket">]</span></span>
 <p>Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models: reviews. 2024b. URL: <a class="reference external" href="https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf">https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf</a>.</p>
 </div>
-<div class="citation" id="id180" role="doc-biblioentry">
+<div class="citation" id="id187" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id13">BJN+22</a><span class="fn-bracket">]</span></span>
 <p>Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2204.05862">https://arxiv.org/abs/2204.05862</a>, <a class="reference external" href="https://arxiv.org/abs/2204.05862">arXiv:2204.05862</a>.</p>
 </div>
@@ -2457,7 +2466,7 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id34">BKK+22</a><span class="fn-bracket">]</span></span>
 <p>Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2212.08073">https://arxiv.org/abs/2212.08073</a>, <a class="reference external" href="https://arxiv.org/abs/2212.08073">arXiv:2212.08073</a>.</p>
 </div>
-<div class="citation" id="id186" role="doc-biblioentry">
+<div class="citation" id="id193" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id19">Blo23</a><span class="fn-bracket">]</span></span>
 <p>NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: <a class="reference external" href="https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/">https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/</a>.</p>
 </div>
@@ -2465,7 +2474,7 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id49">CCL+24</a><span class="fn-bracket">]</span></span>
 <p>Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. Humans or llms as the judge? a study on judgement biases. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2402.10669">https://arxiv.org/abs/2402.10669</a>, <a class="reference external" href="https://arxiv.org/abs/2402.10669">arXiv:2402.10669</a>.</p>
 </div>
-<div class="citation" id="id182" role="doc-biblioentry">
+<div class="citation" id="id189" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id9">DPHZ23</a><span class="fn-bracket">]</span></span>
 <p>Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2305.14314">https://arxiv.org/abs/2305.14314</a>, <a class="reference external" href="https://arxiv.org/abs/2305.14314">arXiv:2305.14314</a>.</p>
 </div>
@@ -2478,11 +2487,11 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id7">Fac24</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Zephyr. 2024. Zephyr. URL: <a class="reference external" href="https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha">https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha</a>.</p>
 </div>
-<div class="citation" id="id184" role="doc-biblioentry">
+<div class="citation" id="id191" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id17">Fac4c</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Rlhf. 2024c. RLHF. URL: <a class="reference external" href="https://huggingface.co/blog/rlhf">https://huggingface.co/blog/rlhf</a>.</p>
 </div>
-<div class="citation" id="id188" role="doc-biblioentry">
+<div class="citation" id="id195" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id21">Fac4d</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Trl. 2024d. TRL. URL: <a class="reference external" href="https://huggingface.co/docs/trl/en/index">https://huggingface.co/docs/trl/en/index</a>.</p>
 </div>
@@ -2511,7 +2520,7 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id22">HDN+24</a><span class="fn-bracket">]</span></span>
 <p>Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.06000">https://arxiv.org/abs/2412.06000</a>, <a class="reference external" href="https://arxiv.org/abs/2412.06000">arXiv:2412.06000</a>.</p>
 </div>
-<div class="citation" id="id181" role="doc-biblioentry">
+<div class="citation" id="id188" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id8">HSW+21</a><span class="fn-bracket">]</span></span>
 <p>Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: <a class="reference external" href="https://arxiv.org/abs/2106.09685">https://arxiv.org/abs/2106.09685</a>, <a class="reference external" href="https://arxiv.org/abs/2106.09685">arXiv:2106.09685</a>.</p>
 </div>
@@ -2557,7 +2566,7 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id1">1</a>,<a role="doc-backlink" href="#id11">2</a>,<a role="doc-backlink" href="#id18">3</a>,<a role="doc-backlink" href="#id20">4</a>)</span>
 <p>Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2305.18290">https://arxiv.org/abs/2305.18290</a>, <a class="reference external" href="https://arxiv.org/abs/2305.18290">arXiv:2305.18290</a>.</p>
 </div>
-<div class="citation" id="id185" role="doc-biblioentry">
+<div class="citation" id="id192" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id16">SWD+17</a><span class="fn-bracket">]</span></span>
 <p>John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: <a class="reference external" href="https://arxiv.org/abs/1707.06347">https://arxiv.org/abs/1707.06347</a>, <a class="reference external" href="https://arxiv.org/abs/1707.06347">arXiv:1707.06347</a>.</p>
 </div>
@@ -2578,7 +2587,7 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id25">SRvERH24</a><span class="fn-bracket">]</span></span>
 <p>Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2411.09539">https://arxiv.org/abs/2411.09539</a>, <a class="reference external" href="https://arxiv.org/abs/2411.09539">arXiv:2411.09539</a>.</p>
 </div>
-<div class="citation" id="id179" role="doc-biblioentry">
+<div class="citation" id="id186" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id13">TMS+23</a><span class="fn-bracket">]</span></span>
 <p>Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2307.09288">https://arxiv.org/abs/2307.09288</a>, <a class="reference external" href="https://arxiv.org/abs/2307.09288">arXiv:2307.09288</a>.</p>
 </div>
@@ -2590,7 +2599,7 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id38">WYG+24</a><span class="fn-bracket">]</span></span>
 <p>Tianhao Wu, Weizhe Yuan, Olga Golovneva, Jing Xu, Yuandong Tian, Jiantao Jiao, Jason Weston, and Sainbayar Sukhbaatar. Meta-rewarding language models: self-improving alignment with llm-as-a-meta-judge. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2407.19594">https://arxiv.org/abs/2407.19594</a>, <a class="reference external" href="https://arxiv.org/abs/2407.19594">arXiv:2407.19594</a>.</p>
 </div>
-<div class="citation" id="id183" role="doc-biblioentry">
+<div class="citation" id="id190" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id15">XFG+24</a><span class="fn-bracket">]</span></span>
 <p>Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2404.10719">https://arxiv.org/abs/2404.10719</a>, <a class="reference external" href="https://arxiv.org/abs/2404.10719">arXiv:2404.10719</a>.</p>
 </div>
@@ -2628,11 +2637,11 @@ <h2><a class="toc-backref" href="#id263" role="doc-backlink"><span class="sectio
             <div class="inner"><ul class="page-nav">
   <li class="prev">
     <a href="safety.html"
-       title="previous chapter">← <span class="section-number">5. </span>Safety</a>
+       title="previous chapter">← <span class="section-number">6. </span>Safety</a>
   </li>
   <li class="next">
     <a href="local.html"
-       title="next chapter"><span class="section-number">7. </span>Local LLMs in Practice →</a>
+       title="next chapter"><span class="section-number">8. </span>Local LLMs in Practice →</a>
   </li>
 </ul><div class="footer" role="contentinfo">
       &#169; Copyright Tharsis T. P. Souza, 2024.
diff --git a/tamingllms/_build/html/notebooks/cost.html b/tamingllms/_build/html/notebooks/cost.html
index 007349f..c9648b4 100644
--- a/tamingllms/_build/html/notebooks/cost.html
+++ b/tamingllms/_build/html/notebooks/cost.html
@@ -4,7 +4,7 @@
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width,initial-scale=1"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
 
-      <title>8. The Falling Cost Paradox</title>
+      <title>9. The Falling Cost Paradox</title>
     
           <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
           <link rel="stylesheet" href="../_static/theme.css " type="text/css" />
@@ -37,7 +37,7 @@
     
   <link rel="index" title="Index" href="../genindex.html" />
   <link rel="search" title="Search" href="../search.html" />
-  <link rel="prev" title="7. Local LLMs in Practice" href="local.html" /> 
+  <link rel="prev" title="8. Local LLMs in Practice" href="local.html" /> 
   </head>
 
   <body>
@@ -153,6 +153,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
@@ -220,14 +229,14 @@
   <ul class="breadcrumbs">
     <li><a href="../markdown/toc.html">Docs</a> &raquo;</li>
     
-    <li><span class="section-number">8. </span>The Falling Cost Paradox</li>
+    <li><span class="section-number">9. </span>The Falling Cost Paradox</li>
   </ul>
   
 
   <ul class="page-nav">
   <li class="prev">
     <a href="local.html"
-       title="previous chapter">← <span class="section-number">7. </span>Local LLMs in Practice</a>
+       title="previous chapter">← <span class="section-number">8. </span>Local LLMs in Practice</a>
   </li>
 </ul>
   
@@ -236,7 +245,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="the-falling-cost-paradox">
-<span id="cost"></span><h1><a class="toc-backref" href="#id194" role="doc-backlink"><span class="section-number">8. </span>The Falling Cost Paradox</a><a class="headerlink" href="#the-falling-cost-paradox" title="Permalink to this heading">¶</a></h1>
+<span id="cost"></span><h1><a class="toc-backref" href="#id201" role="doc-backlink"><span class="section-number">9. </span>The Falling Cost Paradox</a><a class="headerlink" href="#the-falling-cost-paradox" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>It is a confusion of ideas to suppose that the economical use of fuel is equivalent to diminished consumption. <br>
 The very contrary is the truth.</p>
@@ -245,37 +254,37 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#the-falling-cost-paradox" id="id194">The Falling Cost Paradox</a></p>
+<li><p><a class="reference internal" href="#the-falling-cost-paradox" id="id201">The Falling Cost Paradox</a></p>
 <ul>
-<li><p><a class="reference internal" href="#why-optimization-matters-more-than-ever" id="id195">Why Optimization Matters More Than Ever</a></p></li>
-<li><p><a class="reference internal" href="#right-sizing-llms-a-strategic-approach" id="id196">Right-Sizing LLMs: A Strategic Approach</a></p>
+<li><p><a class="reference internal" href="#why-optimization-matters-more-than-ever" id="id202">Why Optimization Matters More Than Ever</a></p></li>
+<li><p><a class="reference internal" href="#right-sizing-llms-a-strategic-approach" id="id203">Right-Sizing LLMs: A Strategic Approach</a></p>
 <ul>
-<li><p><a class="reference internal" href="#metrics" id="id197">Metrics</a></p></li>
-<li><p><a class="reference internal" href="#requirements" id="id198">Requirements</a></p>
+<li><p><a class="reference internal" href="#metrics" id="id204">Metrics</a></p></li>
+<li><p><a class="reference internal" href="#requirements" id="id205">Requirements</a></p>
 <ul>
-<li><p><a class="reference internal" href="#business-requirements" id="id199">Business Requirements</a></p></li>
-<li><p><a class="reference internal" href="#performance-requirements" id="id200">Performance Requirements</a></p></li>
-<li><p><a class="reference internal" href="#operational-requirements" id="id201">Operational Requirements</a></p></li>
-<li><p><a class="reference internal" href="#technical-requirements" id="id202">Technical Requirements</a></p></li>
+<li><p><a class="reference internal" href="#business-requirements" id="id206">Business Requirements</a></p></li>
+<li><p><a class="reference internal" href="#performance-requirements" id="id207">Performance Requirements</a></p></li>
+<li><p><a class="reference internal" href="#operational-requirements" id="id208">Operational Requirements</a></p></li>
+<li><p><a class="reference internal" href="#technical-requirements" id="id209">Technical Requirements</a></p></li>
 </ul>
 </li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#quantization" id="id203">Quantization</a></p></li>
-<li><p><a class="reference internal" href="#check-list" id="id204">Check-list</a></p></li>
-<li><p><a class="reference internal" href="#conclusion" id="id205">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id206">References</a></p></li>
+<li><p><a class="reference internal" href="#quantization" id="id210">Quantization</a></p></li>
+<li><p><a class="reference internal" href="#check-list" id="id211">Check-list</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id212">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id213">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="why-optimization-matters-more-than-ever">
-<h2><a class="toc-backref" href="#id195" role="doc-backlink"><span class="section-number">8.1. </span>Why Optimization Matters More Than Ever</a><a class="headerlink" href="#why-optimization-matters-more-than-ever" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id202" role="doc-backlink"><span class="section-number">9.1. </span>Why Optimization Matters More Than Ever</a><a class="headerlink" href="#why-optimization-matters-more-than-ever" title="Permalink to this heading">¶</a></h2>
 <p>According to recent analysis from a16z <span id="id1">[<a class="reference internal" href="#id97" title="Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: https://a16z.com/llmflation-llm-inference-cost/.">Andreessen Horowitz, 2024</a>]</span>, the cost of LLM inference is decreasing by approximately 10x every year - a rate that outpaces even Moore’s Law in the PC revolution or Edholm’s Law during the bandwidth explosion of the dot-com era.</p>
 <figure class="align-center" id="llmflation">
 <a class="reference internal image-reference" href="../_images/llmflation.png"><img alt="LLMflation" src="../_images/llmflation.png" style="width: 900.0px; height: 663.9px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 8.1 </span><span class="caption-text">LLMflation <span id="id2">[<a class="reference internal" href="#id97" title="Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: https://a16z.com/llmflation-llm-inference-cost/.">Andreessen Horowitz, 2024</a>]</span>: The cost of LLM inference is decreasing by approximately 10x every year.</span><a class="headerlink" href="#llmflation" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 9.1 </span><span class="caption-text">LLMflation <span id="id2">[<a class="reference internal" href="#id97" title="Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: https://a16z.com/llmflation-llm-inference-cost/.">Andreessen Horowitz, 2024</a>]</span>: The cost of LLM inference is decreasing by approximately 10x every year.</span><a class="headerlink" href="#llmflation" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>A model achieving an MMLU score of 42 that cost $60 per million tokens in late 2021 can now be run for just $0.06 per million tokens. For higher-capability models scoring 83 on MMLU, prices have fallen by a factor of 62 since GPT-4’s introduction in March 2023.</p>
@@ -330,16 +339,16 @@ <h2><a class="toc-backref" href="#id195" role="doc-backlink"><span class="sectio
 <p>Motivated by this insight, in the next sections we will dive into the factors that drive LLM cost decay and how to optimize LLM usage in practical applications. The discussion will explore key optimization areas including inference optimization through techniques like Flash Attention and cached prompts, model compression via quantization and distillation, and practical implementation patterns such as response caching, batch processing, and early stopping - all aimed at achieving efficient usage and cost reductions while maintaining model performance and reliability.</p>
 </section>
 <section id="right-sizing-llms-a-strategic-approach">
-<h2><a class="toc-backref" href="#id196" role="doc-backlink"><span class="section-number">8.2. </span>Right-Sizing LLMs: A Strategic Approach</a><a class="headerlink" href="#right-sizing-llms-a-strategic-approach" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id203" role="doc-backlink"><span class="section-number">9.2. </span>Right-Sizing LLMs: A Strategic Approach</a><a class="headerlink" href="#right-sizing-llms-a-strategic-approach" title="Permalink to this heading">¶</a></h2>
 <p>Before implementing cost optimization strategies for LLMs, organizations must develop a comprehensive understanding of their own requirements and constraints. This systematic approach prevents both over-engineering and under-provisioning, leading to more efficient and cost-effective implementations.</p>
 <p>In this section, we define key performance and cost related metrics that will guide our discussion. Then we propose a set of requirements practitioners should consider before we dive into cost optimization techniques.</p>
 <section id="metrics">
-<h3><a class="toc-backref" href="#id197" role="doc-backlink"><span class="section-number">8.2.1. </span>Metrics</a><a class="headerlink" href="#metrics" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id204" role="doc-backlink"><span class="section-number">9.2.1. </span>Metrics</a><a class="headerlink" href="#metrics" title="Permalink to this heading">¶</a></h3>
 </section>
 <section id="requirements">
-<h3><a class="toc-backref" href="#id198" role="doc-backlink"><span class="section-number">8.2.2. </span>Requirements</a><a class="headerlink" href="#requirements" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id205" role="doc-backlink"><span class="section-number">9.2.2. </span>Requirements</a><a class="headerlink" href="#requirements" title="Permalink to this heading">¶</a></h3>
 <section id="business-requirements">
-<h4><a class="toc-backref" href="#id199" role="doc-backlink"><span class="section-number">8.2.2.1. </span>Business Requirements</a><a class="headerlink" href="#business-requirements" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id206" role="doc-backlink"><span class="section-number">9.2.2.1. </span>Business Requirements</a><a class="headerlink" href="#business-requirements" title="Permalink to this heading">¶</a></h4>
 <p>First, one needs to define the problem to be solved and to what extent it is worth to be solved. Use case requirements form the foundation of any LLM implementation project. A clear definition of the specific business problema and task to be accomplished must be established upfront, along with concrete performance metrics covering accuracy, latency and throughput. This should be accompanied by well-defined cost-per-transaction targets, clear ROI expectations, and a strategic allocation of budgets across different use cases to ensure resources are optimally distributed.</p>
 <p>Budget and ROI considerations are critical for ensuring the long-term viability of LLM implementations. Organizations must establish clear spending limits that align with their financial capabilities while defining realistic cost-per-transaction targets. ROI expectations need to be carefully established through detailed analysis, followed by a strategic allocation of budgets across various use cases based on their business impact and priority.</p>
 <p>Compliance and security requirements cannot be overlooked. This involves a thorough identification of all applicable regulatory requirements and the establishment of robust data handling standards. Organizations must specify comprehensive audit requirements to maintain transparency and accountability, while implementing appropriate security controls to protect sensitive data and system access.</p>
@@ -347,17 +356,17 @@ <h4><a class="toc-backref" href="#id199" role="doc-backlink"><span class="sectio
 <p>Chapter <a class="reference internal" href="local.html#local"><span class="std std-ref">Local LLMs in Practice</span></a> provides a detailed discussion on relevant considerations when <a class="reference internal" href="local.html#local-model-selection"><span class="std std-ref">Choosing your Model</span></a>.</p>
 </section>
 <section id="performance-requirements">
-<h4><a class="toc-backref" href="#id200" role="doc-backlink"><span class="section-number">8.2.2.2. </span>Performance Requirements</a><a class="headerlink" href="#performance-requirements" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id207" role="doc-backlink"><span class="section-number">9.2.2.2. </span>Performance Requirements</a><a class="headerlink" href="#performance-requirements" title="Permalink to this heading">¶</a></h4>
 <p>Accuracy and quality form the foundation of any LLM deployment’s performance requirements. At its core, this involves determining the minimum level of accuracy that the model must achieve to be considered successful. This serves as a critical baseline for evaluating model performance and making deployment decisions. Establishing clear evaluation metrics, whether through automated measures or human evaluation processes, provides concrete ways to assess if these thresholds are being met. Continuous monitoring of these accuracy metrics ensures the system maintains its performance over time as usage patterns and data distributions evolve. Chapter <a class="reference internal" href="evals.html#evals"><span class="std std-ref">The Evals Gap</span></a> provides a detailed discussion on how to evaluate the performance of LLM-based applications.</p>
 <p>Latency and throughput requirements are equally crucial for ensuring a positive user experience and system reliability. These specifications define how quickly the system must respond to requests and how many concurrent users it can handle. Response time requirements must be carefully balanced against the computational resources available, while peak load capabilities need to account for usage spikes and growth patterns. The decision between real-time processing for immediate responses versus batch processing for efficiency depends heavily on the use case and user expectations.</p>
 </section>
 <section id="operational-requirements">
-<h4><a class="toc-backref" href="#id201" role="doc-backlink"><span class="section-number">8.2.2.3. </span>Operational Requirements</a><a class="headerlink" href="#operational-requirements" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id208" role="doc-backlink"><span class="section-number">9.2.2.3. </span>Operational Requirements</a><a class="headerlink" href="#operational-requirements" title="Permalink to this heading">¶</a></h4>
 <p>Scale and capacity planning forms the foundation of operational requirements for LLM deployments. This involves a comprehensive analysis of expected system usage and growth patterns to ensure the infrastructure can handle both current and future demands. Organizations must carefully project their daily and monthly API call volumes while calculating the average number of tokens per request to accurately estimate resource needs. Understanding usage patterns, including seasonal variations, enables proper capacity planning. Additionally, developing 12-24 month growth projections helps ensure the infrastructure can scale appropriately as demand increases.</p>
 <p>Reliability and availability requirements are equally critical for maintaining consistent service quality. These specifications define the expected uptime percentage that the system must maintain, typically expressed as a percentage of total operational time. Organizations need to establish clear maintenance windows that minimize disruption to users while ensuring necessary system updates and optimizations can be performed. Comprehensive backup and failover requirements must be specified to ensure business continuity in case of failures. High availability needs should be clearly defined, including redundancy levels and recovery time objectives, to maintain service quality even during unexpected events.</p>
 </section>
 <section id="technical-requirements">
-<h4><a class="toc-backref" href="#id202" role="doc-backlink"><span class="section-number">8.2.2.4. </span>Technical Requirements</a><a class="headerlink" href="#technical-requirements" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id209" role="doc-backlink"><span class="section-number">9.2.2.4. </span>Technical Requirements</a><a class="headerlink" href="#technical-requirements" title="Permalink to this heading">¶</a></h4>
 <p>System integration requirements define how the LLM system will interact and communicate with existing infrastructure and applications. This involves carefully mapping all integration points where the LLM system needs to connect with other systems, establishing standardized data formats and interfaces for seamless communication, implementing robust security measures to protect data in transit, and identifying any technical constraints that could impact integration. Getting these integration requirements right is crucial for ensuring the LLM system can function effectively within the broader technical ecosystem.</p>
 <p>Data management requirements address how information will be stored, processed, and maintained within the LLM system. This encompasses determining appropriate storage solutions for maintaining conversation context and history, selecting and configuring vector databases to enable efficient retrieval-augmented generation (RAG), creating comprehensive data retention policies that balance operational needs with resource constraints, and ensuring all data handling practices comply with relevant privacy regulations. Proper data management is essential for both system performance and regulatory compliance, making it a critical consideration in any LLM implementation.</p>
 <p>This structured approach to requirements analysis enables organizations to:</p>
@@ -372,7 +381,7 @@ <h4><a class="toc-backref" href="#id202" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="quantization">
-<h2><a class="toc-backref" href="#id203" role="doc-backlink"><span class="section-number">8.3. </span>Quantization</a><a class="headerlink" href="#quantization" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id210" role="doc-backlink"><span class="section-number">9.3. </span>Quantization</a><a class="headerlink" href="#quantization" title="Permalink to this heading">¶</a></h2>
 <p>Quantization is a common and relevant technique in making LLMs more efficient and accessible. At a high level, quantization reduces the number of bits used to represent a model’s parameters. The most common form of quantization is to represent model’s weights at lower precision at post-training phase. It has become a standard technique to generate a series of quantized models given a large pre-trained base model.</p>
 <p>While a standard pre-trained LLM might use 32-bit floating-point (FP32) or 16-bit floating-point (FP16) numbers to store its weights, quantized versions can operate at lower precision levels such as 8, 4 or even 2 bits per parameter, reducing memory footprint without proportional losses in performance, necessarily. For instance, for a model of 30 billion parameters, using FP32 means 4 bytes per weight or 120 GB for the whole weights. If the model is quantized such that weights are represented in 1 byte, the memory needed for the model’s weights decreases to 30 GB, hence potentially fitting into consumer grade hardware. This is done at the cost of precision loss, but the trade-off is often worth it though require careful analysis.</p>
 <p>Let’s take a look at model weights of a language model (<code class="docutils literal notranslate"><span class="pre">SmolLM2-135M-Instruct</span></code>) that has been quantized to 2-bit and 16-bit precisions. We will use an utility function <code class="docutils literal notranslate"><span class="pre">load_gguf</span></code> from the <code class="docutils literal notranslate"><span class="pre">taming_utils</span></code> package to load model weights of the quantized models directly from Hugging Face.</p>
@@ -468,21 +477,21 @@ <h2><a class="toc-backref" href="#id203" role="doc-backlink"><span class="sectio
 </div>
 </div>
 </div>
-<p>Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by <span id="id4">[<a class="reference internal" href="#id86" title="Unsloth. Llama-3.3-70b-instruct-gguf. Hugging Face Model, 2024. GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model. URL: https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF.">Unsloth, 2024</a>]</span> <a class="footnote-reference brackets" href="#unsloth" id="id5" role="doc-noteref"><span class="fn-bracket">[</span>2<span class="fn-bracket">]</span></a>. The model’s memory requirements vary significantly based on the quantization level used as demonstrated in <a class="reference internal" href="#quantized"><span class="std std-numref">Fig. 8.2</span></a>.</p>
+<p>Quantization is a powerful technique for reducing the memory footprint of LLMs. This can be exemplified by the case of LLaMa 3.3 70B as quantized by <span id="id4">[<a class="reference internal" href="#id86" title="Unsloth. Llama-3.3-70b-instruct-gguf. Hugging Face Model, 2024. GGUF quantized version of Meta's Llama 3.3 70B instruction-tuned model. URL: https://huggingface.co/unsloth/Llama-3.3-70B-Instruct-GGUF.">Unsloth, 2024</a>]</span> <a class="footnote-reference brackets" href="#unsloth" id="id5" role="doc-noteref"><span class="fn-bracket">[</span>2<span class="fn-bracket">]</span></a>. The model’s memory requirements vary significantly based on the quantization level used as demonstrated in <a class="reference internal" href="#quantized"><span class="std std-numref">Fig. 9.2</span></a>.</p>
 <figure class="align-center" id="quantized">
 <a class="reference internal image-reference" href="../_images/quantized.png"><img alt="Quantized Model Size" src="../_images/quantized.png" style="width: 867.5px; height: 338.0px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 8.2 </span><span class="caption-text">Quantized Model Size: <code class="docutils literal notranslate"><span class="pre">unsloth/Llama-3.3-70B-Instruct-GGUF</span></code></span><a class="headerlink" href="#quantized" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 9.2 </span><span class="caption-text">Quantized Model Size: <code class="docutils literal notranslate"><span class="pre">unsloth/Llama-3.3-70B-Instruct-GGUF</span></code></span><a class="headerlink" href="#quantized" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>We observe that the quantization process yields remarkable reductions in model size, demonstrating a clear trade-off between precision and memory requirements. The transition from F16 (141.1 GB) to Q8_0 (75 GB) achieves a dramatic 47% reduction in model size while maintaining relatively high numerical precision. Further quantization levels reveal an interesting pattern of diminishing returns - each step down in precision yields progressively smaller absolute size reductions, though the cumulative effect remains significant. At the extreme end, the Q2_K model (26.4 GB) requires only 19% of the storage space of its F16 counterpart <a class="footnote-reference brackets" href="#quantization-levels" id="id6" role="doc-noteref"><span class="fn-bracket">[</span>3<span class="fn-bracket">]</span></a>.</p>
 <p>This wide spectrum of model sizes enables deployment across diverse hardware environments. The lightweight Q2_K variant opens possibilities for running inference on consumer-grade hardware like high-end laptops or desktop computers. In contrast, the full-precision F16 model demands enterprise-grade computing resources with substantial memory capacity. This flexibility in deployment options makes quantization a powerful tool for democratizing access to large language models while managing computational costs.</p>
 <p>While quantization has proven highly effective, there is a limit to how far it can be pushed - specifically, the 1-bit ceiling. A notable advancement in this space is BitNet <span id="id7">[<a class="reference internal" href="#id92" title="Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: https://arxiv.org/abs/2410.16144, arXiv:2410.16144.">Wang <em>et al.</em>, 2024</a>]</span> which pushes the boundaries of extreme quantization.</p>
-<p>BitNet’s implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see <a class="reference internal" href="#bitnet"><span class="std std-numref">Fig. 8.3</span></a>). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet’s optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).</p>
+<p>BitNet’s implementation, bitnet.cpp, has demonstrated significant performance improvements across both ARM and x86 architectures (see <a class="reference internal" href="#bitnet"><span class="std std-numref">Fig. 9.3</span></a>). When compared to llama.cpp, the framework achieves speedups ranging from 1.37x to 5.07x on ARM processors and 2.37x to 6.17x on x86 systems. These performance gains scale with model size - larger models benefit more substantially from BitNet’s optimizations. The efficiency improvements extend beyond raw speed: energy consumption drops by 55-70% on ARM and 71-82% on x86 processors. Perhaps most impressively, bitnet.cpp enables running a 100B parameter BitNet b1.58 model on a single CPU at speeds matching human reading pace (5-7 tokens per second).</p>
 <figure class="align-center" id="bitnet">
 <a class="reference internal image-reference" href="../_images/bitnet.png"><img alt="BitNet" src="../_images/bitnet.png" style="width: 787.5px; height: 436.8px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 8.3 </span><span class="caption-text">BitNet: <span id="id8">[<a class="reference internal" href="#id92" title="Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: https://arxiv.org/abs/2410.16144, arXiv:2410.16144.">Wang <em>et al.</em>, 2024</a>]</span></span><a class="headerlink" href="#bitnet" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 9.3 </span><span class="caption-text">BitNet: <span id="id8">[<a class="reference internal" href="#id92" title="Jinheng Wang, Hansong Zhou, Ting Song, Shaoguang Mao, Shuming Ma, Hongyu Wang, Yan Xia, and Furu Wei. 1-bit ai infra: part 1.1, fast and lossless bitnet b1.58 inference on cpus. 2024. URL: https://arxiv.org/abs/2410.16144, arXiv:2410.16144.">Wang <em>et al.</em>, 2024</a>]</span></span><a class="headerlink" href="#bitnet" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The framework’s initial release focused on CPU inference optimization, with particular emphasis on 1-bit LLM architectures (BitNet b1.58). While initial testing shows promising results, these findings are specific to the tested models and kernels (its specialized kernels are carefully crafted to exploit the unique characteristics of these extremely quantized models). Further validation is needed before generalizing these results across different architectures and use cases.</p>
@@ -491,7 +500,7 @@ <h2><a class="toc-backref" href="#id203" role="doc-backlink"><span class="sectio
 <p>Each reduction in precision risks performance degradation. Finding optimal quantization schemes remains an active research area. See Case Study on Quantization for Local Models in Chapter <a class="reference internal" href="local.html#local"><span class="std std-ref">Local LLMs in Practice</span></a> for more details.</p>
 </section>
 <section id="check-list">
-<h2><a class="toc-backref" href="#id204" role="doc-backlink"><span class="section-number">8.4. </span>Check-list</a><a class="headerlink" href="#check-list" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id211" role="doc-backlink"><span class="section-number">9.4. </span>Check-list</a><a class="headerlink" href="#check-list" title="Permalink to this heading">¶</a></h2>
 <p><strong>Planning and Requirements</strong></p>
 <ul class="contains-task-list simple">
 <li class="task-list-item"><p><input class="task-list-item-checkbox" disabled="disabled" type="checkbox"> Start with a clear understanding of your application’s needs and the factors that contribute to LLM costs</p></li>
@@ -525,7 +534,7 @@ <h2><a class="toc-backref" href="#id204" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id205" role="doc-backlink"><span class="section-number">8.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id212" role="doc-backlink"><span class="section-number">9.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -539,7 +548,7 @@ <h2><a class="toc-backref" href="#id205" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id206" role="doc-backlink"><span class="section-number">8.6. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id213" role="doc-backlink"><span class="section-number">9.6. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id9">
 <div class="citation" id="id92" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>WZS+24<span class="fn-bracket">]</span></span>
@@ -552,7 +561,7 @@ <h2><a class="toc-backref" href="#id206" role="doc-backlink"><span class="sectio
 <p>Andreessen Horowitz. Llmflation: understanding and mitigating llm inference cost. Blog Post, 2024. Analysis of LLM inference costs and strategies for optimization. URL: <a class="reference external" href="https://a16z.com/llmflation-llm-inference-cost/">https://a16z.com/llmflation-llm-inference-cost/</a>.</p>
 </div>
 <div class="citation" id="id98" role="doc-biblioentry">
-<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id193">HuggingFace4w</a><span class="fn-bracket">]</span></span>
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id200">HuggingFace4w</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: <a class="reference external" href="https://huggingface.co/docs/hub/gguf#quantization-types">https://huggingface.co/docs/hub/gguf#quantization-types</a>.</p>
 </div>
 <div class="citation" id="id86" role="doc-biblioentry">
@@ -573,7 +582,7 @@ <h2><a class="toc-backref" href="#id206" role="doc-backlink"><span class="sectio
 </aside>
 <aside class="footnote brackets" id="quantization-levels" role="note">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id6">3</a><span class="fn-bracket">]</span></span>
-<p>You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in <span id="id193">[<a class="reference internal" href="local.html#id130" title="Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.">Hugging Face, 2024w</a>]</span>.</p>
+<p>You may have noticed quantization levels have a special notation. Including the bit width in the name of the model but also quantization types (e.g. _K, _0). You can find more information about the quantization levels in <span id="id200">[<a class="reference internal" href="local.html#id130" title="Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.">Hugging Face, 2024w</a>]</span>.</p>
 </aside>
 </aside>
 </section>
@@ -604,7 +613,7 @@ <h2><a class="toc-backref" href="#id206" role="doc-backlink"><span class="sectio
             <div class="inner"><ul class="page-nav">
   <li class="prev">
     <a href="local.html"
-       title="previous chapter">← <span class="section-number">7. </span>Local LLMs in Practice</a>
+       title="previous chapter">← <span class="section-number">8. </span>Local LLMs in Practice</a>
   </li>
 </ul><div class="footer" role="contentinfo">
       &#169; Copyright Tharsis T. P. Souza, 2024.
diff --git a/tamingllms/_build/html/notebooks/evals.html b/tamingllms/_build/html/notebooks/evals.html
index 5ddf80d..5a846c4 100644
--- a/tamingllms/_build/html/notebooks/evals.html
+++ b/tamingllms/_build/html/notebooks/evals.html
@@ -182,6 +182,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
@@ -253,7 +262,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="the-evals-gap">
-<span id="evals"></span><h1><a class="toc-backref" href="#id228" role="doc-backlink"><span class="section-number">3. </span>The Evals Gap</a><a class="headerlink" href="#the-evals-gap" title="Permalink to this heading">¶</a></h1>
+<span id="evals"></span><h1><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">3. </span>The Evals Gap</a><a class="headerlink" href="#the-evals-gap" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>It doesn’t matter how beautiful your theory is, <br>
 it doesn’t matter how smart you are. <br>
@@ -263,49 +272,49 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#the-evals-gap" id="id228">The Evals Gap</a></p>
+<li><p><a class="reference internal" href="#the-evals-gap" id="id235">The Evals Gap</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id229">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#non-deterministic-generative-machines" id="id230">Non-Deterministic Generative Machines</a></p></li>
-<li><p><a class="reference internal" href="#emerging-properties" id="id231">Emerging Properties</a></p></li>
-<li><p><a class="reference internal" href="#problem-statement" id="id232">Problem Statement</a></p></li>
-<li><p><a class="reference internal" href="#evals-design" id="id233">Evals Design</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id236">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#non-deterministic-generative-machines" id="id237">Non-Deterministic Generative Machines</a></p></li>
+<li><p><a class="reference internal" href="#emerging-properties" id="id238">Emerging Properties</a></p></li>
+<li><p><a class="reference internal" href="#problem-statement" id="id239">Problem Statement</a></p></li>
+<li><p><a class="reference internal" href="#evals-design" id="id240">Evals Design</a></p>
 <ul>
-<li><p><a class="reference internal" href="#conceptual-overview" id="id234">Conceptual Overview</a></p></li>
-<li><p><a class="reference internal" href="#design-considerations" id="id235">Design Considerations</a></p></li>
+<li><p><a class="reference internal" href="#conceptual-overview" id="id241">Conceptual Overview</a></p></li>
+<li><p><a class="reference internal" href="#design-considerations" id="id242">Design Considerations</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#metrics" id="id236">Metrics</a></p></li>
-<li><p><a class="reference internal" href="#evaluators" id="id237">Evaluators</a></p>
+<li><p><a class="reference internal" href="#metrics" id="id243">Metrics</a></p></li>
+<li><p><a class="reference internal" href="#evaluators" id="id244">Evaluators</a></p>
 <ul>
-<li><p><a class="reference internal" href="#model-based-evaluation" id="id238">Model-Based Evaluation</a></p></li>
-<li><p><a class="reference internal" href="#evaluating-evaluators" id="id239">Evaluating Evaluators</a></p></li>
+<li><p><a class="reference internal" href="#model-based-evaluation" id="id245">Model-Based Evaluation</a></p></li>
+<li><p><a class="reference internal" href="#evaluating-evaluators" id="id246">Evaluating Evaluators</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#benchmarks-and-leaderboards" id="id240">Benchmarks and Leaderboards</a></p></li>
-<li><p><a class="reference internal" href="#tools" id="id241">Tools</a></p>
+<li><p><a class="reference internal" href="#benchmarks-and-leaderboards" id="id247">Benchmarks and Leaderboards</a></p></li>
+<li><p><a class="reference internal" href="#tools" id="id248">Tools</a></p>
 <ul>
-<li><p><a class="reference internal" href="#lighteval" id="id242">LightEval</a></p></li>
-<li><p><a class="reference internal" href="#langsmith" id="id243">LangSmith</a></p></li>
-<li><p><a class="reference internal" href="#promptfoo" id="id244">PromptFoo</a></p></li>
-<li><p><a class="reference internal" href="#comparison" id="id245">Comparison</a></p></li>
+<li><p><a class="reference internal" href="#lighteval" id="id249">LightEval</a></p></li>
+<li><p><a class="reference internal" href="#langsmith" id="id250">LangSmith</a></p></li>
+<li><p><a class="reference internal" href="#promptfoo" id="id251">PromptFoo</a></p></li>
+<li><p><a class="reference internal" href="#comparison" id="id252">Comparison</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id246">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id247">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id248">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id253">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id254">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id255">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id229" role="doc-backlink"><span class="section-number">3.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">3.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>The advent of LLMs marks a pivotal shift in the landscape of software development and evaluation. Unlike traditional software systems, where deterministic outputs are the norm, LLMs introduce a realm of non-deterministic and generative behaviors that challenge conventional software engineering testing paradigms. This shift is not merely a technical evolution but a fundamental transformation in how we conceive, build, and assess software products.</p>
 <p>For those entrenched in traditional methodologies, the transition to LLM-driven systems may seem daunting. However, ignoring this change is not an option. The reliance on outdated testing frameworks that fail to account for the probabilistic nature of LLMs will inevitably lead to significant setbacks.</p>
 <p>To overcome these challenges, it is imperative to embrace the complexities of LLMs with a proactive mindset. This involves developing robust evaluation frameworks up-front, fostering a product development culture of continuous change, learning and adaptation.</p>
 </section>
 <section id="non-deterministic-generative-machines">
-<h2><a class="toc-backref" href="#id230" role="doc-backlink"><span class="section-number">3.2. </span>Non-Deterministic Generative Machines</a><a class="headerlink" href="#non-deterministic-generative-machines" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">3.2. </span>Non-Deterministic Generative Machines</a><a class="headerlink" href="#non-deterministic-generative-machines" title="Permalink to this heading">¶</a></h2>
 <p>One of the most fundamental challenges when building products with Large Language Models (LLMs) is their generative and non-deterministic nature. Unlike traditional software systems where the same input reliably produces the same output, LLMs can generate novel text that may not exist in their training data, and produce different responses each time they’re queried - even with identical prompts and input data. This behavior is both a strength and a significant engineering challenge and product challenge.</p>
 <p>When you ask an LLM the same question multiple times, you’ll likely get different responses. This isn’t a bug - it’s a fundamental feature of how these models work. The “temperature” parameter, which controls the randomness of outputs, allows models to be creative and generate diverse responses. However, this same feature makes it difficult to build reliable, testable systems.</p>
 <p>Consider a financial services company using LLMs to generate investment advice. The non-deterministic nature of these models means that:</p>
@@ -440,7 +449,7 @@ <h2><a class="toc-backref" href="#id230" role="doc-backlink"><span class="sectio
 <p>How can one effectively test an LLM-powered system when the same prompt can yield radically different outputs based on a single parameter? Traditional testing relies on predictable inputs and outputs, but LLMs force us to grapple with probabilistic behavior. While lower temperatures may seem safer for critical applications, they don’t necessarily eliminate the underlying uncertainty. This highlights the need for new evaluation paradigms that can handle both deterministic and probabilistic aspects of LLM behavior.</p>
 </section>
 <section id="emerging-properties">
-<h2><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">3.3. </span>Emerging Properties</a><a class="headerlink" href="#emerging-properties" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">3.3. </span>Emerging Properties</a><a class="headerlink" href="#emerging-properties" title="Permalink to this heading">¶</a></h2>
 <p>Beyond their non-deterministic nature, LLMs present another fascinating characteristic: emergent abilities that spontaneously arise as models scale up in size. These abilities - from basic question answering to complex reasoning - aren’t explicitly programmed but rather emerge “naturally” as the models grow larger and are trained on more data. This makes evaluation fundamentally different from traditional software testing, where capabilities are explicitly coded and can be tested against pre-defined specifications.</p>
 <p><a class="reference internal" href="#id4"><span class="std std-numref">Fig. 3.1</span></a> provides a list of emergent abilities of large language models and the scale. The relationship between model scale and emergent abilities follows a fascinating non-linear pattern. Below certain size thresholds, specific abilities may be completely absent from the model - it simply cannot perform certain tasks, no matter how much you try to coax them out. However, once the model reaches critical points in its scaling journey, these abilities can suddenly manifest in what researchers call a phase transition - a dramatic shift from inability to capability. This unpredictable emergence of capabilities stands in stark contrast to traditional software development, where features are deliberately implemented and can be systematically tested.</p>
 <figure class="align-center" id="id4">
@@ -452,7 +461,7 @@ <h2><a class="toc-backref" href="#id231" role="doc-backlink"><span class="sectio
 <p>The implications for evaluation are critical. While conventional software testing relies on stable test suites and well-defined acceptance criteria, LLM evaluation must contend with a constantly shifting landscape of capabilities. What worked to evaluate a 7B parameter model may be completely inadequate for a 70B parameter model that has developed new emergent abilities. This dynamic nature of LLM capabilities forces us to fundamentally rethink our approach to testing and evaluation.</p>
 </section>
 <section id="problem-statement">
-<h2><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">3.4. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">3.4. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
 <p>Consider a practical example that illustrates these challenges: building a Math AI tutoring system for children powered by an LLM. In traditional software development, you would define specific features (like presenting math problems or checking answers) and write tests to verify each function. But with LLMs, you’re not just testing predefined features - you’re trying to evaluate emergent capabilities like adapting explanations to a child’s level, maintaining engagement through conversational learning, and providing age-appropriate safety-bound content.</p>
 <p>This fundamental difference raises critical questions about evaluation:</p>
 <ul class="simple">
@@ -502,7 +511,7 @@ <h2><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="evals-design">
-<h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">3.5. </span>Evals Design</a><a class="headerlink" href="#evals-design" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">3.5. </span>Evals Design</a><a class="headerlink" href="#evals-design" title="Permalink to this heading">¶</a></h2>
 <p>First, it’s important to make a distinction between evaluating an LLM versus evaluating an LLM-based application. While the latter offers foundation capabilities and are typically general-purpose, the former is more specific and tailored to a particular use case. Here, we define an LLM-based application as a system that uses one or more LLMs to perform a specific task. More specifically, an LLM-based application is the combination of one or more LLM models, their associated prompts and parameters to solve a particular business problem.</p>
 <p>That differentiation is important because it changes the scope of evaluation. LLMs are usually evaluated based on their capabilities, which include things like language understanding, reasoning and knowledge. LLM-based applications, instead, should be evaluated based on their end-to-end functionality, performance, and how well they meet business requirements. That distinction has key implications for the design of evaluation systems:</p>
 <ul class="simple">
@@ -589,7 +598,7 @@ <h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="sectio
 </tbody>
 </table>
 <section id="conceptual-overview">
-<h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">3.5.1. </span>Conceptual Overview</a><a class="headerlink" href="#conceptual-overview" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">3.5.1. </span>Conceptual Overview</a><a class="headerlink" href="#conceptual-overview" title="Permalink to this heading">¶</a></h3>
 <p><a class="reference internal" href="#conceptual"><span class="std std-numref">Fig. 3.2</span></a> demonstrates a conceptual design of key components of LLM Application evaluation.</p>
 <figure class="align-center" id="conceptual">
 <a class="reference internal image-reference" href="../_images/conceptual.png"><img alt="Conceptual Overview" src="../_images/conceptual.png" style="width: 992.8000000000001px; height: 424.0px;" /></a>
@@ -670,7 +679,7 @@ <h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="design-considerations">
-<h3><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">3.5.2. </span>Design Considerations</a><a class="headerlink" href="#design-considerations" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">3.5.2. </span>Design Considerations</a><a class="headerlink" href="#design-considerations" title="Permalink to this heading">¶</a></h3>
 <p>The design of an LLM application evaluation system depends heavily on the specific use case and business requirements. Here we list important questions for planning an LLM application evaluation system pertaining to each of the key components previously introduced:</p>
 <p><strong>1. Examples (Input Dataset):</strong></p>
 <ul class="simple">
@@ -755,7 +764,7 @@ <h3><a class="toc-backref" href="#id235" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="metrics">
-<h2><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">3.6. </span>Metrics</a><a class="headerlink" href="#metrics" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">3.6. </span>Metrics</a><a class="headerlink" href="#metrics" title="Permalink to this heading">¶</a></h2>
 <p>The choice of metric depends on the specific task and desired evaluation criteria. However, one can categorize metrics into two broad categories: <strong>intrinsic</strong> and <strong>extrinsic</strong>.</p>
 <ul class="simple">
 <li><p><strong>Intrinsic metrics</strong> focus on the model’s performance on its primary training objective, which is typically to predict the next token in a sequence.  Perplexity is a common intrinsic metric that measures how well the model predicts a given sample of text.</p></li>
@@ -1066,9 +1075,9 @@ <h2><a class="toc-backref" href="#id236" role="doc-backlink"><span class="sectio
 <p>To address these limitations, alternative approaches like <strong>human-based evaluation</strong> and <strong>model-based evaluation</strong> are often used, which will be discussed in the following sections.</p>
 </section>
 <section id="evaluators">
-<h2><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">3.7. </span>Evaluators</a><a class="headerlink" href="#evaluators" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">3.7. </span>Evaluators</a><a class="headerlink" href="#evaluators" title="Permalink to this heading">¶</a></h2>
 <section id="model-based-evaluation">
-<h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">3.7.1. </span>Model-Based Evaluation</a><a class="headerlink" href="#model-based-evaluation" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">3.7.1. </span>Model-Based Evaluation</a><a class="headerlink" href="#model-based-evaluation" title="Permalink to this heading">¶</a></h3>
 <p>Traditional metrics like BLEU or ROUGE often fall short in capturing the nuanced, contextual, and creative outputs of LLMs. As an alternative we can consider a “Model-based evaluation” approach. A common approach is to use an LLM as a judge. This is an approach that leverages language models themselves to assess the quality of outputs from other language models. This method involves using a model (often a more capable one) to act as an automated judge, evaluating aspects like accuracy, coherence, and relevance of generated content. Unlike traditional metrics that rely on exact matching or statistical measures, model-based evaluation can capture nuanced aspects of language and provide more contextual assessment.</p>
 <p>As discussed in the paper <span id="id6">[<a class="reference internal" href="#id57" title="Zhen Li, Xiaohan Xu, Tao Shen, Can Xu, Jia-Chen Gu, Yuxuan Lai, Chongyang Tao, and Shuai Ma. Leveraging large language models for nlg evaluation: advances and challenges. 2024. URL: https://arxiv.org/abs/2401.07103, arXiv:2401.07103.">Li <em>et al.</em>, 2024</a>]</span>, LLM-based evaluation approaches generally fall into two main categories:</p>
 <ol class="arabic simple">
@@ -1309,7 +1318,7 @@ <h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="sectio
 <p>One open source solution trying to overcome some of these challenges is Glider <span id="id10">[<a class="reference internal" href="#id120" title="Darshan Deshpande, Selvan Sunitha Ravi, Sky CH-Wang, Bartosz Mielczarek, Anand Kannappan, and Rebecca Qian. Glider: grading llm interactions and decisions using explainable ranking. 2024. URL: https://arxiv.org/abs/2412.14140, arXiv:2412.14140.">Deshpande <em>et al.</em>, 2024</a>]</span>, a 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria. Glider is an LLM model trained on 685 domains and 183 criteria whose judgement scores show 91.3% agreement with human judgments, making it suitable for a diverse range of real world applications.</p>
 </section>
 <section id="evaluating-evaluators">
-<h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">3.7.2. </span>Evaluating Evaluators</a><a class="headerlink" href="#evaluating-evaluators" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">3.7.2. </span>Evaluating Evaluators</a><a class="headerlink" href="#evaluating-evaluators" title="Permalink to this heading">¶</a></h3>
 <p>We have discussed how LLMs can be used to evaluate LLM-based aplications. However, how can we evaluate the performance of LLMs that evaluate other LLMs? This is the question that meta evaluation aims to answer. Clearly, the discussion can become quite meta as we need to evaluate the performance of the evaluator to evaluate the performance of the evaluated model. However, one can make a case for two general options:</p>
 <ol class="arabic simple">
 <li><p>Use a gold-standard dataset that is used to evaluate the performance of LLM evaluators using a “metrics-based” approach.</p></li>
@@ -1353,7 +1362,7 @@ <h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="benchmarks-and-leaderboards">
-<h2><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">3.8. </span>Benchmarks and Leaderboards</a><a class="headerlink" href="#benchmarks-and-leaderboards" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">3.8. </span>Benchmarks and Leaderboards</a><a class="headerlink" href="#benchmarks-and-leaderboards" title="Permalink to this heading">¶</a></h2>
 <p>Benchmarks act as standardized tests for LLMs, evaluating their performance across a spectrum of tasks. These tasks simulate real-world applications such as answering questions, generating coherent text, solving mathematical problems, or even writing computer code. They also assess more abstract qualities like fairness, robustness, and cultural understanding.</p>
 <p>Benchmarks can be thought as comprehensive “exams” that probe different “subjects” in order to certify an LLM. They help researchers and developers compare models systematically, in a way LLM performance is comparable while enabling the identification of emergent behaviors or capabilities as models evolve in scale and sophistication.</p>
 <p>The history of LLM benchmarks reflects the evolving priorities of artificial intelligence research, starting with foundational tasks and moving toward complex, real-world challenges. It began in 2018 with the introduction of <strong>GLUE</strong>(General Language Understanding Evaluation) <span id="id12">[<a class="reference internal" href="#id83" title="Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. Glue: a multi-task benchmark and analysis platform for natural language understanding. 2019. URL: https://arxiv.org/abs/1804.07461, arXiv:1804.07461.">Wang <em>et al.</em>, 2019</a>]</span>, which set a new standard for evaluating natural language understanding. GLUE measured performance on tasks like sentiment analysis and textual entailment, providing a baseline for assessing the fundamental capabilities of language models. A year later, <strong>SuperGLUE</strong> <span id="id13">[<a class="reference internal" href="#id84" title="Alex Wang, Yada Pruksachatkun, Nikita Nangia, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R. Bowman. Superglue: a stickier benchmark for general-purpose language understanding systems. Advances in Neural Information Processing Systems, 2019.">Wang <em>et al.</em>, 2019</a>]</span> expanded on this foundation by introducing more nuanced tasks that tested reasoning and language comprehension at a deeper level, challenging the limits of models like BERT and its successors.</p>
@@ -1398,16 +1407,16 @@ <h2><a class="toc-backref" href="#id240" role="doc-backlink"><span class="sectio
 <p>The ARC-AGI benchmark remained unbeaten for five years as of December 2024 (a minimum score of 85% is required to win) <span id="id28">[<a class="reference internal" href="#id96" title="Francois Chollet. Arc prize 2024 results. ARC Prize Website, 12/08/2024. URL: https://arcprize.org/2024-results.">Chollet, 12/08/2024</a>]</span>. While deep learning has significantly advanced in recent years, pure deep learning approaches perform poorly on the ARC-AGI benchmark. This is because traditional deep learning relies on relating new situations to those encountered during training and lacks the ability to adapt or recombine knowledge for entirely new tasks. ARC Prize 2024 spurred the development of novel AGI reasoning techniques, leading to a significant increase in the state-of-the-art score on the ARC-AGI private evaluation set from 33% in 2023 to 55.5% in 2024. A key takeaway is that algorithmic improvements, rather than massive computational resources, may be key to exceeding the target score for the ARC-AGI benchmark.</p>
 <p>In addition to the benchmarks discussed above, a growing set of domain-specific benchmarks is emerging to help evaluate LLMs in specific verticals, including:</p>
 <ul class="simple">
-<li><p>FinBench <span id="id29">[<a class="reference internal" href="#id218" title="Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: https://openreview.net/forum?id=AeGrf1uY0p.">Zhang <em>et al.</em>, 2024</a>]</span>: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.</p></li>
-<li><p>LegalBench <span id="id30">[<a class="reference internal" href="#id216" title="Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: https://arxiv.org/abs/2308.11462, arXiv:2308.11462.">Guha <em>et al.</em>, 2023</a>]</span> : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals</p></li>
-<li><p>Berkeley Function Leaderboard (BFCL) <span id="id31">[<a class="reference internal" href="#id219" title="Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. arXiv preprint arXiv:2305.15334, 2023.">Patil <em>et al.</em>, 2023</a>]</span>: Evaluates LLMs’ function-calling abilities</p></li>
+<li><p>FinBench <span id="id29">[<a class="reference internal" href="#id225" title="Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: https://openreview.net/forum?id=AeGrf1uY0p.">Zhang <em>et al.</em>, 2024</a>]</span>: Evaluates LLMs in the financial domain, covering tasks such as terminology understanding, temporal reasoning, future forecasting, scenario planning, and numerical modelling.</p></li>
+<li><p>LegalBench <span id="id30">[<a class="reference internal" href="#id223" title="Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: https://arxiv.org/abs/2308.11462, arXiv:2308.11462.">Guha <em>et al.</em>, 2023</a>]</span> : Assesses the legal reasoning abilities of LLMs through tasks crowdsourced by legal professionals</p></li>
+<li><p>Berkeley Function Leaderboard (BFCL) <span id="id31">[<a class="reference internal" href="#id226" title="Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. arXiv preprint arXiv:2305.15334, 2023.">Patil <em>et al.</em>, 2023</a>]</span>: Evaluates LLMs’ function-calling abilities</p></li>
 </ul>
 <p>As language models continue to advance in capability and complexity, evaluation frameworks must evolve. Modern benchmarks increasingly incorporate tests for nuanced reasoning, ethical decision-making, and emergent capabilities that weren’t previously measurable. This ongoing evolution reflects a deeper understanding that the true value of language models lies not in achieving high scores on standardized tests with narrow task-specific metrics, but in their ability to meaningfully contribute to human understanding and help solve real-world problems while demonstrating the ability to learn and adapt to new tasks.</p>
 </section>
 <section id="tools">
-<h2><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">3.9. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">3.9. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
 <section id="lighteval">
-<h3><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">3.9.1. </span>LightEval</a><a class="headerlink" href="#lighteval" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">3.9.1. </span>LightEval</a><a class="headerlink" href="#lighteval" title="Permalink to this heading">¶</a></h3>
 <p>LightEval <span id="id32">[<a class="reference internal" href="#id63" title="Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: https://github.com/huggingface/lighteval.">Fourrier <em>et al.</em>, 2023</a>]</span> is a lightweight framework for evaluation of LLMs across a variety of standard and bespoke metrics and tasks across multiple inference backends via Python SDK and CLI.</p>
 <p>As a motivating example, consider a scenario where financial data has been extracted from SEC financial filings and require econometric analysis. Tasks like estimating autoregressive models for time series forecasting or conducting hypothesis tests on market efficiency are common in financial analysis. Let’s evaluate how well different models perform on this type of task.</p>
 <p>First, we need to select a benchmark to assess LLMs capabilities in this domain. MMLU has a sub-benchmark called Econometrics we can use for this task. <a class="reference internal" href="#mmlu-econometrics"><span class="std std-numref">Table 3.4</span></a> shows a sample of the benchmark dataset from MMLU Econometrics. It consists of multiple-choice questions from econometrics and expected answers.</p>
@@ -1596,7 +1605,7 @@ <h3><a class="toc-backref" href="#id242" role="doc-backlink"><span class="sectio
 <p>In summary, LightEval is a simple yet flexible and comprehensive framework for evaluating LLMs across a wide variety of tasks and metrics. It can serve as a first step in selecting your next LLM for a specific task given the exponential growth in number of (open source) models available <span id="id40">[<a class="reference internal" href="#id73" title="Hugging Face. Number of models on hugging face. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024?day=4, 2024. Accessed: 12/06/2024.">Hugging Face, 2024</a>]</span>. Its integration with the Hugging Face ecosystem and modular architecture make it particularly powerful for evaluating open source models. For further details, visit the <a class="reference external" href="https://github.com/huggingface/lighteval">official repository</a> <span id="id41">[<a class="reference internal" href="#id63" title="Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: https://github.com/huggingface/lighteval.">Fourrier <em>et al.</em>, 2023</a>]</span>.</p>
 </section>
 <section id="langsmith">
-<h3><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">3.9.2. </span>LangSmith</a><a class="headerlink" href="#langsmith" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">3.9.2. </span>LangSmith</a><a class="headerlink" href="#langsmith" title="Permalink to this heading">¶</a></h3>
 <p>Let’s revisit our evaluation example when we were interested in evaluating the quality of summaries generated by different (smaller and cheaper) LLM models compared to a benchmark model (larger and more expensive). Recal the setup:</p>
 <ul class="simple">
 <li><p>Benchmark model: gpt-4o</p></li>
@@ -2004,7 +2013,7 @@ <h3><a class="toc-backref" href="#id243" role="doc-backlink"><span class="sectio
 </figure>
 </section>
 <section id="promptfoo">
-<h3><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">3.9.3. </span>PromptFoo</a><a class="headerlink" href="#promptfoo" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">3.9.3. </span>PromptFoo</a><a class="headerlink" href="#promptfoo" title="Permalink to this heading">¶</a></h3>
 <p>Promptfoo <span id="id43">[<a class="reference internal" href="#id103" title="promptfoo. Promptfoo: llm testing and evaluation framework. 2024. Open source framework for testing and evaluating LLM prompts. URL: https://www.promptfoo.dev/.">promptfoo, 2024</a>]</span> is an open-source framework designed for evaluating applications that utilize large language models (LLMs). Key features include:</p>
 <ol class="arabic simple">
 <li><p><strong>Automated Testing</strong>: Promptfoo provides automated testing capabilities, allowing developers to run custom evaluations tailored to their applications.</p></li>
@@ -2269,7 +2278,7 @@ <h3 class="rubric" id="prompt-comparison-results-by-section">Prompt Comparison R
 <p>In conclusion, Promptfoo can serve as an effective LLM application evaluation tool particularly for its ability to decouple several components of the evaluation process. Hence enabling the user to focus on the most important aspects of the evaluation given the particular application and criteria making it a valuable and flexible tool for LLM application development.</p>
 </section>
 <section id="comparison">
-<h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">3.9.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">3.9.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h3>
 <p>The following table provides a summarized comparative analysis of three open source frameworks for language models evaluation we have discussed: Lighteval, LangSmith, and Promptfoo. Each framework is assessed based on key features such as integration capabilities, customization options, ease of use, and the ability to facilitate human and LLM collaboration.</p>
 <table class="docutils align-default" id="tool-comparison">
 <caption><span class="caption-number">Table 3.6 </span><span class="caption-text">Comparison of Lighteval, LangSmith, and Promptfoo</span><a class="headerlink" href="#tool-comparison" title="Permalink to this table">¶</a></caption>
@@ -2306,13 +2315,13 @@ <h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">3.10. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">3.10. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>Language models have fundamentally transformed how software is developed and evaluated. Unlike conventional systems that produce predictable outputs, LLMs generate varied, probabilistic responses that defy traditional testing approaches. While developers accustomed to deterministic systems may find this shift challenging, continuing to rely on legacy testing methods is unsustainable. These frameworks were not designed to handle the inherent variability of LLM outputs and will ultimately prove inadequate.</p>
 <p>Success requires embracing this new paradigm by implementing comprehensive evaluation strategies early - this is the new Product Requirements Document (PRD) - and cultivating an organizational mindset focused on iteration, experimentation and growth.</p>
 <p>The shift from traditional software testing to LLM evaluation is not just a change in tools but a transformation in mindset. Those who recognize and adapt to this shift will lead the way in harnessing the power of LLMs. However, the cost of inaction is not just technological stagnation, but potential business failure.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">3.11. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">3.11. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -2326,7 +2335,7 @@ <h2><a class="toc-backref" href="#id247" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">3.12. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">3.12. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id44">
 <div class="citation" id="id65" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id39">ALB+24</a><span class="fn-bracket">]</span></span>
@@ -2393,7 +2402,7 @@ <h2><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id32">1</a>,<a role="doc-backlink" href="#id41">2</a>)</span>
 <p>Clémentine Fourrier, Nathan Habib, Thomas Wolf, and Lewis Tunstall. Lighteval: a lightweight framework for llm evaluation. 2023. URL: <a class="reference external" href="https://github.com/huggingface/lighteval">https://github.com/huggingface/lighteval</a>.</p>
 </div>
-<div class="citation" id="id216" role="doc-biblioentry">
+<div class="citation" id="id223" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id30">GNH+23</a><span class="fn-bracket">]</span></span>
 <p>Neel Guha, Julian Nyarko, Daniel E. Ho, Christopher Ré, Adam Chilton, Aditya Narayana, Alex Chohlas-Wood, Austin Peters, Brandon Waldon, Daniel N. Rockmore, Diego Zambrano, Dmitry Talisman, Enam Hoque, Faiz Surani, Frank Fagan, Galit Sarfaty, Gregory M. Dickinson, Haggai Porat, Jason Hegland, Jessica Wu, Joe Nudell, Joel Niklaus, John Nay, Jonathan H. Choi, Kevin Tobia, Margaret Hagan, Megan Ma, Michael Livermore, Nikon Rasumov-Rahe, Nils Holzenberger, Noam Kolt, Peter Henderson, Sean Rehaag, Sharad Goel, Shang Gao, Spencer Williams, Sunny Gandhi, Tom Zur, Varun Iyer, and Zehua Li. Legalbench: a collaboratively built benchmark for measuring legal reasoning in large language models. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2308.11462">https://arxiv.org/abs/2308.11462</a>, <a class="reference external" href="https://arxiv.org/abs/2308.11462">arXiv:2308.11462</a>.</p>
 </div>
@@ -2426,7 +2435,7 @@ <h2><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id15">LHE22</a><span class="fn-bracket">]</span></span>
 <p>Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2109.07958">https://arxiv.org/abs/2109.07958</a>, <a class="reference external" href="https://arxiv.org/abs/2109.07958">arXiv:2109.07958</a>.</p>
 </div>
-<div class="citation" id="id219" role="doc-biblioentry">
+<div class="citation" id="id226" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id31">PZWG23</a><span class="fn-bracket">]</span></span>
 <p>Shishir G. Patil, Tianjun Zhang, Xin Wang, and Joseph E. Gonzalez. Gorilla: large language model connected with massive apis. <em>arXiv preprint arXiv:2305.15334</em>, 2023.</p>
 </div>
@@ -2470,7 +2479,7 @@ <h2><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id38">YYH+24</a><span class="fn-bracket">]</span></span>
 <p>An Yang, Baosong Yang, Binyuan Hui, Bo Zheng, Bowen Yu, Chang Zhou, Chengpeng Li, Chengyuan Li, Dayiheng Liu, Fei Huang, Guanting Dong, Haoran Wei, Huan Lin, Jialong Tang, Jialin Wang, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Ma, Jin Xu, Jingren Zhou, Jinze Bai, Jinzheng He, Junyang Lin, Kai Dang, Keming Lu, Keqin Chen, Kexin Yang, Mei Li, Mingfeng Xue, Na Ni, Pei Zhang, Peng Wang, Ru Peng, Rui Men, Ruize Gao, Runji Lin, Shijie Wang, Shuai Bai, Sinan Tan, Tianhang Zhu, Tianhao Li, Tianyu Liu, Wenbin Ge, Xiaodong Deng, Xiaohuan Zhou, Xingzhang Ren, Xinyu Zhang, Xipin Wei, Xuancheng Ren, Yang Fan, Yang Yao, Yichang Zhang, Yu Wan, Yunfei Chu, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zhihao Fan. Qwen2 technical report. <em>arXiv preprint arXiv:2407.10671</em>, 2024.</p>
 </div>
-<div class="citation" id="id218" role="doc-biblioentry">
+<div class="citation" id="id225" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id29">ZCL24</a><span class="fn-bracket">]</span></span>
 <p>Zhihan Zhang, Yixin Cao, and Lizi Liao. Finbench: benchmarking LLMs in complex financial problem solving and reasoning. 2024. URL: <a class="reference external" href="https://openreview.net/forum?id=AeGrf1uY0p">https://openreview.net/forum?id=AeGrf1uY0p</a>.</p>
 </div>
diff --git a/tamingllms/_build/html/notebooks/input.html b/tamingllms/_build/html/notebooks/input.html
new file mode 100644
index 0000000..19c3783
--- /dev/null
+++ b/tamingllms/_build/html/notebooks/input.html
@@ -0,0 +1,2106 @@
+<!DOCTYPE html>
+<html  lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width,initial-scale=1"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
+
+      <title>5. Managing Input Data</title>
+    
+          <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/theme.css " type="text/css" />
+          <link rel="stylesheet" href="../_static/togglebutton.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/copybutton.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/sphinx-thebe.css" type="text/css" />
+          <link rel="stylesheet" href="../_static/sphinx-design.4cbf315f70debaebd550c87a6162cf0f.min.css" type="text/css" />
+      
+      <!-- sphinx script_files -->
+        <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
+        <script src="../_static/doctools.js"></script>
+        <script src="../_static/sphinx_highlight.js"></script>
+        <script src="../_static/clipboard.min.js"></script>
+        <script src="../_static/copybutton.js"></script>
+        <script src="../_static/scripts/sphinx-book-theme.js"></script>
+        <script>let toggleHintShow = 'Click to show';</script>
+        <script>let toggleHintHide = 'Click to hide';</script>
+        <script>let toggleOpenOnPrint = 'true';</script>
+        <script src="../_static/togglebutton.js"></script>
+        <script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
+        <script src="../_static/design-tabs.js"></script>
+        <script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
+        <script async="async" src="../_static/sphinx-thebe.js"></script>
+
+      
+      <!-- bundled in js (rollup iife) -->
+      <!-- <script src="../_static/theme-vendors.js"></script> -->
+      <script src="../_static/theme.js" defer></script>
+    
+  <link rel="index" title="Index" href="../genindex.html" />
+  <link rel="search" title="Search" href="../search.html" />
+  <link rel="next" title="6. Safety" href="safety.html" />
+  <link rel="prev" title="4. Structured Output" href="structured_output.html" /> 
+  </head>
+
+  <body>
+    <div id="app">
+    <div class="theme-container" :class="pageClasses"><navbar @toggle-sidebar="toggleSidebar">
+  <router-link to="../markdown/toc.html" class="home-link">
+    
+      <span class="site-name">Taming LLMs</span>
+    
+  </router-link>
+
+  <div class="links">
+    <navlinks class="can-hide">
+
+
+
+  
+    <div class="nav-item">
+      <a href="https://tamingllm.substack.com/"
+        class="nav-link external">
+          Newsletter <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+    <div class="nav-item">
+      <a href="https://github.com/souzatharsis/tamingllms"
+        class="nav-link external">
+          Github <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+
+    </navlinks>
+  </div>
+</navbar>
+
+      
+      <div class="sidebar-mask" @click="toggleSidebar(false)">
+      </div>
+        <sidebar @toggle-sidebar="toggleSidebar">
+          
+          <navlinks>
+            
+
+
+
+  
+    <div class="nav-item">
+      <a href="https://tamingllm.substack.com/"
+        class="nav-link external">
+          Newsletter <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+    <div class="nav-item">
+      <a href="https://github.com/souzatharsis/tamingllms"
+        class="nav-link external">
+          Github <outboundlink></outboundlink>
+      </a>
+    </div>
+  
+
+            
+          </navlinks><div id="searchbox" class="searchbox" role="search">
+  <div class="caption"><span class="caption-text">Quick search</span>
+    <div class="searchformwrapper">
+      <form class="search" action="../search.html" method="get">
+        <input type="text" name="q" />
+        <input type="submit" value="Search" />
+        <input type="hidden" name="check_keywords" value="yes" />
+        <input type="hidden" name="area" value="default" />
+      </form>
+    </div>
+  </div>
+</div><div class="sidebar-links" role="navigation" aria-label="main navigation">
+  
+    <div class="sidebar-group">
+      <p class="caption">
+        <span class="caption-text"><a href="../markdown/toc.html#taming-llms">taming llms</a></span>
+      </p>
+      <ul class="current">
+        
+          <li class="toctree-l1 ">
+            
+              <a href="../markdown/preface.html" class="reference internal ">Preface</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="../markdown/intro.html" class="reference internal ">About the Book</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="evals.html" class="reference internal ">The Evals Gap</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="structured_output.html" class="reference internal ">Structured Output</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 current">
+            
+              <a href="#" class="reference internal current">Managing Input Data</a>
+            
+
+            
+              <ul>
+                
+                  <li class="toctree-l2"><a href="#introduction" class="reference internal">Introduction</a></li>
+                
+                  <li class="toctree-l2"><a href="#parsing-documents" class="reference internal">Parsing Documents</a></li>
+                
+                  <li class="toctree-l2"><a href="#retrieval-augmented-generation" class="reference internal">Retrieval-Augmented Generation</a></li>
+                
+                  <li class="toctree-l2"><a href="#case-studies" class="reference internal">Case Studies</a></li>
+                
+                  <li class="toctree-l2"><a href="#conclusion" class="reference internal">Conclusion</a></li>
+                
+                  <li class="toctree-l2"><a href="#references" class="reference internal">References</a></li>
+                
+              </ul>
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="safety.html" class="reference internal ">Safety</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="alignment.html" class="reference internal ">Preference-Based Alignment</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="local.html" class="reference internal ">Local LLMs in Practice</a>
+            
+
+            
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="cost.html" class="reference internal ">The Falling Cost Paradox</a>
+            
+
+            
+          </li>
+
+        
+      </ul>
+    </div>
+  
+</div>
+        </sidebar>
+
+      <page>
+          <div class="body-header" role="navigation" aria-label="navigation">
+  
+  <ul class="breadcrumbs">
+    <li><a href="../markdown/toc.html">Docs</a> &raquo;</li>
+    
+    <li><span class="section-number">5. </span>Managing Input Data</li>
+  </ul>
+  
+
+  <ul class="page-nav">
+  <li class="prev">
+    <a href="structured_output.html"
+       title="previous chapter">← <span class="section-number">4. </span>Structured Output</a>
+  </li>
+  <li class="next">
+    <a href="safety.html"
+       title="next chapter"><span class="section-number">6. </span>Safety →</a>
+  </li>
+</ul>
+  
+</div>
+<hr>
+          <div class="content" role="main" v-pre>
+            
+  <section class="tex2jax_ignore mathjax_ignore" id="managing-input-data">
+<span id="input"></span><h1><a class="toc-backref" href="#id206" role="doc-backlink"><span class="section-number">5. </span>Managing Input Data</a><a class="headerlink" href="#managing-input-data" title="Permalink to this heading">¶</a></h1>
+<blockquote class="epigraph">
+<div><p>One home run is much better than two doubles.</p>
+<p class="attribution">—Steve Jobs</p>
+</div></blockquote>
+<nav class="contents" id="contents">
+<p class="topic-title">Contents</p>
+<ul class="simple">
+<li><p><a class="reference internal" href="#managing-input-data" id="id206">Managing Input Data</a></p>
+<ul>
+<li><p><a class="reference internal" href="#introduction" id="id207">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#parsing-documents" id="id208">Parsing Documents</a></p>
+<ul>
+<li><p><a class="reference internal" href="#markitdown" id="id209">MarkItDown</a></p></li>
+<li><p><a class="reference internal" href="#docling" id="id210">Docling</a></p></li>
+<li><p><a class="reference internal" href="#frameworks-based-parsing" id="id211">Frameworks-Based Parsing</a></p></li>
+<li><p><a class="reference internal" href="#structured-data-extraction" id="id212">Structured Data Extraction</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#retrieval-augmented-generation" id="id213">Retrieval-Augmented Generation</a></p></li>
+<li><p><a class="reference internal" href="#case-studies" id="id214">Case Studies</a></p>
+<ul>
+<li><p><a class="reference internal" href="#case-study-i-content-chunking-with-contextual-linking" id="id215">Case Study I: Content Chunking with Contextual Linking</a></p>
+<ul>
+<li><p><a class="reference internal" href="#generating-long-form-content" id="id216">Generating long-form content</a></p></li>
+<li><p><a class="reference internal" href="#discussion" id="id217">Discussion</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#case-study-ii-github-rag" id="id218">Case Study II: Github RAG</a></p></li>
+<li><p><a class="reference internal" href="#case-study-iii-quiz-generation-with-citations" id="id219">Case Study III: Quiz Generation with Citations</a></p>
+<ul>
+<li><p><a class="reference internal" href="#use-case" id="id220">Use Case</a></p></li>
+<li><p><a class="reference internal" href="#implementation" id="id221">Implementation</a></p></li>
+<li><p><a class="reference internal" href="#example-usage" id="id222">Example Usage</a></p></li>
+<li><p><a class="reference internal" href="#id14" id="id223">Discussion</a></p></li>
+</ul>
+</li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#conclusion" id="id224">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id225">References</a></p></li>
+</ul>
+</li>
+</ul>
+</nav>
+<section id="introduction">
+<h2><a class="toc-backref" href="#id207" role="doc-backlink"><span class="section-number">5.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<p>Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) <span id="id1">[<a class="reference internal" href="#id101" title="Jinhyuk Lee, Anthony Chen, Zhuyun Dai, Dheeru Dua, Devendra Singh Sachan, Michael Boratko, Yi Luan, Sébastien M. R. Arnold, Vincent Perot, Siddharth Dalmia, Hexiang Hu, Xudong Lin, Panupong Pasupat, Aida Amini, Jeremy R. Cole, Sebastian Riedel, Iftekhar Naim, Ming-Wei Chang, and Kelvin Guu. Can long-context language models subsume retrieval, rag, sql, and more? 2024. URL: https://arxiv.org/abs/2406.13121, arXiv:2406.13121.">Lee <em>et al.</em>, 2024</a>]</span> have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs.</p>
+<p>LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results <span id="id2">[<a class="reference internal" href="#id143" title="Jiejun Tan, Zhicheng Dou, Wen Wang, Mang Wang, Weipeng Chen, and Ji-Rong Wen. Htmlrag: html is better than plain text for modeling retrieved knowledge in rag systems. 2024. URL: https://arxiv.org/abs/2411.02959, arXiv:2411.02959.">Tan <em>et al.</em>, 2024</a>]</span>. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy <span id="id3">[<a class="reference internal" href="#id146" title="Alfonso Amayuelas, Kyle Wong, Liangming Pan, Wenhu Chen, and William Yang Wang. Knowledge of knowledge: exploring known-unknowns uncertainty with large language models. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, Findings of the Association for Computational Linguistics: ACL 2024, 6416–6432. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.findings-acl.383, doi:10.18653/v1/2024.findings-acl.383.">Amayuelas <em>et al.</em>, 2024</a>]</span>. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge <span id="id4">[<a class="reference internal" href="#id147" title="Suhas Kotha, Jacob Mitchell Springer, and Aditi Raghunathan. Understanding catastrophic forgetting in language models via implicit inference. In The Twelfth International Conference on Learning Representations. 2024. URL: https://openreview.net/forum?id=VrHiF2hsrm.">Kotha <em>et al.</em>, 2024</a>]</span>.</p>
+<p>Motivated by these challenges, this chapter explores two key components:</p>
+<ol class="arabic simple">
+<li><p>Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.</p></li>
+<li><p>Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.</p></li>
+</ol>
+<p>In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.</p>
+<p>In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.</p>
+<p>In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.</p>
+<p>By the chapter’s conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases.</p>
+</section>
+<section id="parsing-documents">
+<h2><a class="toc-backref" href="#id208" role="doc-backlink"><span class="section-number">5.2. </span>Parsing Documents</a><a class="headerlink" href="#parsing-documents" title="Permalink to this heading">¶</a></h2>
+<p>Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.</p>
+<p>We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM’s performance.</p>
+<section id="markitdown">
+<h3><a class="toc-backref" href="#id209" role="doc-backlink"><span class="section-number">5.2.1. </span>MarkItDown</a><a class="headerlink" href="#markitdown" title="Permalink to this heading">¶</a></h3>
+<p>MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.</p>
+<p>Key features:</p>
+<ul class="simple">
+<li><p>Simple command-line and Python API interfaces</p></li>
+<li><p>Support for multiple file formats</p></li>
+<li><p>Optional LLM integration for enhanced image descriptions</p></li>
+<li><p>Batch processing capabilities</p></li>
+<li><p>Docker support for containerized usage</p></li>
+</ul>
+<p>Sample usage:</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">markitdown</span> <span class="kn">import</span> <span class="n">MarkItDown</span>
+
+<span class="n">md</span> <span class="o">=</span> <span class="n">MarkItDown</span><span class="p">()</span>
+<span class="n">result</span> <span class="o">=</span> <span class="n">md</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="s2">&quot;test.xlsx&quot;</span><span class="p">)</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">text_content</span><span class="p">)</span>
+</pre></div>
+</div>
+</section>
+<section id="docling">
+<h3><a class="toc-backref" href="#id210" role="doc-backlink"><span class="section-number">5.2.2. </span>Docling</a><a class="headerlink" href="#docling" title="Permalink to this heading">¶</a></h3>
+<p>Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.</p>
+<p>Key features:</p>
+<ul class="simple">
+<li><p>Support for multiple document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, etc.)</p></li>
+<li><p>Advanced PDF parsing including layout analysis and table extraction</p></li>
+<li><p>Unified document representation format</p></li>
+<li><p>Integration with LlamaIndex and LangChain</p></li>
+<li><p>OCR support for scanned documents</p></li>
+<li><p>Simple CLI interface</p></li>
+</ul>
+<p>Sample usage:</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">docling.document_converter</span> <span class="kn">import</span> <span class="n">DocumentConverter</span>
+
+<span class="n">converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">()</span>
+<span class="n">result</span> <span class="o">=</span> <span class="n">converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="s2">&quot;document.pdf&quot;</span><span class="p">)</span>
+<span class="nb">print</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">export_to_markdown</span><span class="p">())</span>
+</pre></div>
+</div>
+</section>
+<section id="frameworks-based-parsing">
+<h3><a class="toc-backref" href="#id211" role="doc-backlink"><span class="section-number">5.2.3. </span>Frameworks-Based Parsing</a><a class="headerlink" href="#frameworks-based-parsing" title="Permalink to this heading">¶</a></h3>
+</section>
+<section id="structured-data-extraction">
+<h3><a class="toc-backref" href="#id212" role="doc-backlink"><span class="section-number">5.2.4. </span>Structured Data Extraction</a><a class="headerlink" href="#structured-data-extraction" title="Permalink to this heading">¶</a></h3>
+<p>A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch’s CIO Capital Market Outlook released on December 16, 2024 <span id="id5">[<a class="reference internal" href="#id102" title="Merrill Lynch. Chief investment officer capital market outlook. CIO Weekly Letter, 2024. URL: https://olui2.fs.ml.com/publish/content/application/pdf/gwmol/me-cio-weekly-letter.pdf.">Merrill Lynch, 2024</a>]</span>. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see <a class="reference internal" href="#forecast"><span class="std std-numref">Fig. 5.1</span></a>)</p>
+<figure class="align-center" id="forecast">
+<a class="reference internal image-reference" href="../_images/forecast.png"><img alt="Forecast" src="../_images/forecast.png" style="width: 897.5px; height: 953.5px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.1 </span><span class="caption-text">Forecast</span><a class="headerlink" href="#forecast" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">FORECAST_FILE_PATH</span> <span class="o">=</span> <span class="s2">&quot;../data/input/forecast.pdf&quot;</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>First, we will use MarkItDown to extract the text content from the document.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">markitdown</span> <span class="kn">import</span> <span class="n">MarkItDown</span>
+
+<span class="n">md</span> <span class="o">=</span> <span class="n">MarkItDown</span><span class="p">()</span>
+<span class="n">result_md</span> <span class="o">=</span> <span class="n">md</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">FORECAST_FILE_PATH</span><span class="p">)</span><span class="o">.</span><span class="n">text_content</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>Next, we will do the same with Docling.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">docling.document_converter</span> <span class="kn">import</span> <span class="n">DocumentConverter</span>
+
+<span class="n">converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">()</span>
+<span class="n">forecast_result_docling</span> <span class="o">=</span> <span class="n">converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">source</span><span class="p">)</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">export_to_markdown</span><span class="p">()</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the <code class="docutils literal notranslate"><span class="pre">SequenceMatcher</span></code> from the <code class="docutils literal notranslate"><span class="pre">difflib</span></code> package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">Levenshtein</span>
+<span class="k">def</span> <span class="nf">levenshtein_similarity</span><span class="p">(</span><span class="n">text1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">text2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Calculate normalized Levenshtein distance</span>
+<span class="sd">    Returns value between 0 (completely different) and 1 (identical)</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="n">distance</span> <span class="o">=</span> <span class="n">Levenshtein</span><span class="o">.</span><span class="n">distance</span><span class="p">(</span><span class="n">text1</span><span class="p">,</span> <span class="n">text2</span><span class="p">)</span>
+    <span class="n">max_len</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">text1</span><span class="p">),</span> <span class="nb">len</span><span class="p">(</span><span class="n">text2</span><span class="p">))</span>
+    <span class="k">return</span> <span class="mi">1</span> <span class="o">-</span> <span class="p">(</span><span class="n">distance</span> <span class="o">/</span> <span class="n">max_len</span><span class="p">)</span>
+
+<span class="kn">from</span> <span class="nn">difflib</span> <span class="kn">import</span> <span class="n">SequenceMatcher</span>
+<span class="k">def</span> <span class="nf">simple_similarity</span><span class="p">(</span><span class="n">text1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">text2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Calculate similarity ratio using SequenceMatcher</span>
+<span class="sd">    Returns value between 0 (completely different) and 1 (identical)</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="k">return</span> <span class="n">SequenceMatcher</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="n">text1</span><span class="p">,</span> <span class="n">text2</span><span class="p">)</span><span class="o">.</span><span class="n">ratio</span><span class="p">()</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">levenshtein_similarity</span><span class="p">(</span><span class="n">forecast_result_md</span><span class="p">,</span> <span class="n">forecast_result_docling</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>0.13985705461925346
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">simple_similarity</span><span class="p">(</span><span class="n">forecast_result_md</span><span class="p">,</span> <span class="n">forecast_result_docling</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>0.17779960707269155
+</pre></div>
+</div>
+</div>
+</div>
+<p>It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and <code class="docutils literal notranslate"><span class="pre">SequenceMatcher</span></code> respectively.</p>
+<p>Docling’s result is a quite readable markdown displaying key economic variables and their forecasts. Conversely, MarkItDown’s result is a bit messy and hard to read but the information is there just not in a structured format. Does it matter? That’s what we will explore next.</p>
+<p><strong>Docling’s result</strong></p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">display</span><span class="p">(</span><span class="n">Markdown</span><span class="p">(</span><span class="n">forecast_result_docling</span><span class="p">))</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p><a class="reference internal" href="#id6"><span class="std std-numref">Fig. 5.2</span></a> shows part of the parsed result from Docling.</p>
+<figure class="align-center" id="id6">
+<a class="reference internal image-reference" href="../_images/docling.png"><img alt="Docling's result" src="../_images/docling.png" style="width: 1106.3999999999999px; height: 661.8px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.2 </span><span class="caption-text">Docling’s parsed result</span><a class="headerlink" href="#id6" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p><strong>MarkItDown’s result</strong></p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">IPython.display</span> <span class="kn">import</span> <span class="n">display</span><span class="p">,</span> <span class="n">Markdown</span>
+<span class="n">display</span><span class="p">(</span><span class="n">Markdown</span><span class="p">(</span><span class="n">forecast_result_md</span><span class="p">[:</span><span class="mi">500</span><span class="p">]))</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p><a class="reference internal" href="#id7"><span class="std std-numref">Fig. 5.3</span></a> shows part of the parsed result from MarkItDown.</p>
+<figure class="align-center" id="id7">
+<a class="reference internal image-reference" href="../_images/markitdown.png"><img alt="MarkItDown's parsed result" src="../_images/markitdown.png" style="width: 1287.0px; height: 567.0px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.3 </span><span class="caption-text">MarkItDown’s parsed result</span><a class="headerlink" href="#id7" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>Now, let’s focus on the economic forecasts. In particular, we are interested in extracting the CIO’s 2025E forecasts.</p>
+<figure class="align-center" id="forecast2025">
+<a class="reference internal image-reference" href="../_images/2025.png"><img alt="Forecast 2025" src="../_images/2025.png" style="width: 1010.25px; height: 293.85px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.4 </span><span class="caption-text">Forecast 2025</span><a class="headerlink" href="#forecast2025" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>We will define a <code class="docutils literal notranslate"><span class="pre">Forecast</span></code> pydantic model to represent an economic forecast composed of a <code class="docutils literal notranslate"><span class="pre">financial_variable</span></code> and a <code class="docutils literal notranslate"><span class="pre">financial_forecast</span></code>. We will also define a <code class="docutils literal notranslate"><span class="pre">EconForecast</span></code> pydantic model to represent the list of economic forecasts we want to extract from the document.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">pydantic</span> <span class="kn">import</span> <span class="n">BaseModel</span>
+<span class="k">class</span> <span class="nc">Forecast</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
+    <span class="n">financial_variable</span><span class="p">:</span> <span class="nb">str</span>
+    <span class="n">financial_forecast</span><span class="p">:</span> <span class="nb">float</span>
+<span class="k">class</span> <span class="nc">EconForecast</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
+    <span class="n">forecasts</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="n">Forecast</span><span class="p">]</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where <code class="docutils literal notranslate"><span class="pre">extract_prompt</span></code> is kind of data the user would like to extract and <code class="docutils literal notranslate"><span class="pre">doc</span></code> is the input document to analyze.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">BASE_PROMPT</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;&quot;&quot;</span>
+<span class="s2">    ROLE: You are an expert at structured data extraction. </span>
+<span class="s2">    TASK: Extract the following data </span><span class="si">{</span><span class="n">extract_prompt</span><span class="si">}</span><span class="s2"> from input DOCUMENT</span>
+<span class="s2">    FORMAT: The output should be a JSON object with &#39;financial_variable&#39; as key and &#39;financial_forecast&#39; as value.</span>
+<span class="s2">    &quot;&quot;&quot;</span>
+<span class="n">prompt</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">BASE_PROMPT</span><span class="si">}</span><span class="s2"> </span><span class="se">\n\n</span><span class="s2"> DOCUMENT: </span><span class="si">{</span><span class="n">doc</span><span class="si">}</span><span class="s2">&quot;</span>
+</pre></div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">extract_from_doc</span><span class="p">(</span><span class="n">extract_prompt</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>  <span class="n">doc</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">client</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">EconForecast</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Extract data of a financial document using an LLM model.</span>
+<span class="sd">    </span>
+<span class="sd">    Args:</span>
+<span class="sd">        doc: The financial document text to analyze</span>
+<span class="sd">        client: The LLM model to use for analysis</span>
+<span class="sd">        extract_prompt: The prompt to use for extraction</span>
+<span class="sd">        </span>
+<span class="sd">    Returns:</span>
+<span class="sd">        EconForecasts object containing sentiment analysis results</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+
+    <span class="n">BASE_PROMPT</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;&quot;&quot;</span>
+<span class="s2">    ROLE: You are an expert at structured data extraction. </span>
+<span class="s2">    TASK: Extract the following data </span><span class="si">{</span><span class="n">extract_prompt</span><span class="si">}</span><span class="s2"> from input DOCUMENT</span>
+<span class="s2">    FORMAT: The output should be a JSON object with &#39;financial_variable&#39; as key and &#39;financial_forecast&#39; as value.</span>
+<span class="s2">    &quot;&quot;&quot;</span>
+    <span class="n">prompt</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">BASE_PROMPT</span><span class="si">}</span><span class="s2"> </span><span class="se">\n\n</span><span class="s2"> DOCUMENT: </span><span class="si">{</span><span class="n">doc</span><span class="si">}</span><span class="s2">&quot;</span>
+    <span class="n">completion</span> <span class="o">=</span> <span class="n">client</span><span class="o">.</span><span class="n">beta</span><span class="o">.</span><span class="n">chat</span><span class="o">.</span><span class="n">completions</span><span class="o">.</span><span class="n">parse</span><span class="p">(</span>
+        <span class="n">model</span><span class="o">=</span><span class="s2">&quot;gpt-4o-mini&quot;</span><span class="p">,</span>
+        <span class="n">messages</span><span class="o">=</span><span class="p">[</span>
+            <span class="p">{</span>
+                <span class="s2">&quot;role&quot;</span><span class="p">:</span> <span class="s2">&quot;system&quot;</span><span class="p">,</span>
+                <span class="s2">&quot;content&quot;</span><span class="p">:</span> <span class="n">prompt</span>
+            <span class="p">},</span>
+            <span class="p">{</span><span class="s2">&quot;role&quot;</span><span class="p">:</span> <span class="s2">&quot;user&quot;</span><span class="p">,</span> <span class="s2">&quot;content&quot;</span><span class="p">:</span> <span class="n">doc</span><span class="p">}</span>
+        <span class="p">],</span>
+        <span class="n">response_format</span><span class="o">=</span><span class="n">EconForecast</span>
+    <span class="p">)</span>
+    <span class="k">return</span> <span class="n">completion</span><span class="o">.</span><span class="n">choices</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">message</span><span class="o">.</span><span class="n">parsed</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">dotenv</span> <span class="kn">import</span> <span class="n">load_dotenv</span>
+<span class="kn">import</span> <span class="nn">os</span>
+
+<span class="c1"># Load environment variables from .env file</span>
+<span class="n">load_dotenv</span><span class="p">(</span><span class="n">override</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="kn">from</span> <span class="nn">openai</span> <span class="kn">import</span> <span class="n">OpenAI</span>
+<span class="n">client</span> <span class="o">=</span> <span class="n">OpenAI</span><span class="p">()</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>The user then calls the <code class="docutils literal notranslate"><span class="pre">extract_from_doc</span></code> function simply defining that “Economic Forecasts for 2025E” is the data they would like to extract from the document. We perform the extraction twice, once with MarkItDown and once with Docling.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">extract_prompt</span> <span class="o">=</span> <span class="s2">&quot;Economic Forecasts for 2025E&quot;</span>
+<span class="n">md_financials</span> <span class="o">=</span> <span class="n">extract_from_doc</span><span class="p">(</span><span class="n">extract_prompt</span><span class="p">,</span> <span class="n">forecast_result_md</span><span class="p">,</span> <span class="n">client</span><span class="p">)</span>
+<span class="n">docling_financials</span> <span class="o">=</span> <span class="n">extract_from_doc</span><span class="p">(</span><span class="n">extract_prompt</span><span class="p">,</span> <span class="n">forecast_result_docling</span><span class="p">,</span> <span class="n">client</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>The response is an <code class="docutils literal notranslate"><span class="pre">EconForecast</span></code> object containing a list of <code class="docutils literal notranslate"><span class="pre">Forecast</span></code> objects, as defined in the pydantic model. We can then convert the response to a pandas DataFrame for easier comparison.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">md_financials</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>EconForecast(forecasts=[Forecast(financial_variable=&#39;Real global GDP (% y/y annualized)&#39;, financial_forecast=3.2), Forecast(financial_variable=&#39;Real U.S. GDP (% q/q annualized)&#39;, financial_forecast=2.4), Forecast(financial_variable=&#39;CPI inflation (% y/y)&#39;, financial_forecast=2.5), Forecast(financial_variable=&#39;Core CPI inflation (% y/y)&#39;, financial_forecast=3.0), Forecast(financial_variable=&#39;Unemployment rate (%)&#39;, financial_forecast=4.3), Forecast(financial_variable=&#39;Fed funds rate, end period (%)&#39;, financial_forecast=3.88)])
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df_md_forecasts</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([(</span><span class="n">f</span><span class="o">.</span><span class="n">financial_variable</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">financial_forecast</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">md_financials</span><span class="o">.</span><span class="n">forecasts</span><span class="p">],</span> 
+                      <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Variable&#39;</span><span class="p">,</span> <span class="s1">&#39;Forecast&#39;</span><span class="p">])</span>
+<span class="n">df_docling_forecasts</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([(</span><span class="n">f</span><span class="o">.</span><span class="n">financial_variable</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">financial_forecast</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">docling_financials</span><span class="o">.</span><span class="n">forecasts</span><span class="p">],</span> 
+                      <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Variable&#39;</span><span class="p">,</span> <span class="s1">&#39;Forecast&#39;</span><span class="p">])</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df_md_forecasts</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_html"><div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>Variable</th>
+      <th>Forecast</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Real global GDP (% y/y annualized)</td>
+      <td>3.20</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>Real U.S. GDP (% q/q annualized)</td>
+      <td>2.40</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>CPI inflation (% y/y)</td>
+      <td>2.50</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>Core CPI inflation (% y/y)</td>
+      <td>3.00</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>Unemployment rate (%)</td>
+      <td>4.30</td>
+    </tr>
+    <tr>
+      <th>5</th>
+      <td>Fed funds rate, end period (%)</td>
+      <td>3.88</td>
+    </tr>
+  </tbody>
+</table>
+</div></div></div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df_docling_forecasts</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_html"><div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>Variable</th>
+      <th>Forecast</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Real global GDP (% y/y annualized)</td>
+      <td>3.20</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>Real U.S. GDP (% q/q annualized)</td>
+      <td>2.40</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>CPI inflation (% y/y)</td>
+      <td>2.50</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>Core CPI inflation (% y/y)</td>
+      <td>3.00</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>Unemployment rate (%)</td>
+      <td>4.30</td>
+    </tr>
+    <tr>
+      <th>5</th>
+      <td>Fed funds rate, end period (%)</td>
+      <td>3.88</td>
+    </tr>
+  </tbody>
+</table>
+</div></div></div>
+</div>
+<p>The results from MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown’s output appearing less readable from a human perspective, both approaches enabled the LLM to successfully extract the economic forecast data with equal accuracy, in this particular case.</p>
+<p>Now, let’s focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with “Underweight”, passing through “Neutral” and reaching “Overweight”. The actual view is marked by some colored dots in the chart. Let’s see if we can extract this information from the document.</p>
+<figure class="align-center" id="asset-class">
+<a class="reference internal image-reference" href="../_images/asset_class.png"><img alt="Asset Class Weightings" src="../_images/asset_class.png" style="width: 575.0px; height: 739.5px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.5 </span><span class="caption-text">Asset Class Weightings</span><a class="headerlink" href="#asset-class" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>The user will simply define the following data to extract: “Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2”. In that way, we expect that “Underweight” will be mapped to -2, “Neutral” to 0 and “Overweight” to 2 with some values in between.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">extract_prompt</span> <span class="o">=</span> <span class="s2">&quot;Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2&quot;</span>
+<span class="n">asset_class_docling</span> <span class="o">=</span> <span class="n">extract_from_doc</span><span class="p">(</span><span class="n">extract_prompt</span><span class="p">,</span> <span class="n">forecast_result_docling</span><span class="p">,</span> <span class="n">client</span><span class="p">)</span>
+<span class="n">asset_class_md</span> <span class="o">=</span> <span class="n">extract_from_doc</span><span class="p">(</span><span class="n">extract_prompt</span><span class="p">,</span> <span class="n">forecast_result_md</span><span class="p">,</span> <span class="n">client</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">df_md</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([(</span><span class="n">f</span><span class="o">.</span><span class="n">financial_variable</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">financial_forecast</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">asset_class_md</span><span class="o">.</span><span class="n">forecasts</span><span class="p">],</span> 
+                 <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Variable&#39;</span><span class="p">,</span> <span class="s1">&#39;Forecast&#39;</span><span class="p">])</span>
+<span class="n">df_docling</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">([(</span><span class="n">f</span><span class="o">.</span><span class="n">financial_variable</span><span class="p">,</span> <span class="n">f</span><span class="o">.</span><span class="n">financial_forecast</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">asset_class_docling</span><span class="o">.</span><span class="n">forecasts</span><span class="p">],</span> 
+                 <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Variable&#39;</span><span class="p">,</span> <span class="s1">&#39;Forecast&#39;</span><span class="p">])</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added “true_value” column containing the true values from the document, which we extracted manually from the chart.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Create DataFrame with specified columns</span>
+<span class="n">df_comparison</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span>
+    <span class="s1">&#39;variable&#39;</span><span class="p">:</span> <span class="n">df_docling</span><span class="p">[</span><span class="s1">&#39;Variable&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">],</span>
+    <span class="s1">&#39;markitdown&#39;</span><span class="p">:</span> <span class="n">df_md</span><span class="p">[</span><span class="s1">&#39;Forecast&#39;</span><span class="p">],</span>
+    <span class="s1">&#39;docling&#39;</span><span class="p">:</span> <span class="n">df_docling</span><span class="p">[</span><span class="s1">&#39;Forecast&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">iloc</span><span class="p">[:</span><span class="o">-</span><span class="mi">1</span><span class="p">],</span>  <span class="c1"># Drop last row</span>
+    <span class="s1">&#39;true_value&#39;</span><span class="p">:</span> <span class="p">[</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="o">-</span><span class="mf">1.0</span><span class="p">]</span>
+<span class="p">})</span>
+
+<span class="n">display</span><span class="p">(</span><span class="n">df_comparison</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_html"><div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>variable</th>
+      <th>markitdown</th>
+      <th>docling</th>
+      <th>true_value</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Global Equities</td>
+      <td>1.0</td>
+      <td>1.0</td>
+      <td>1.0</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>U.S. Large Cap Growth</td>
+      <td>1.0</td>
+      <td>1.0</td>
+      <td>0.0</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>U.S. Large Cap Value</td>
+      <td>1.0</td>
+      <td>1.0</td>
+      <td>1.0</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>U.S. Small Cap Growth</td>
+      <td>1.0</td>
+      <td>1.0</td>
+      <td>1.0</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>U.S. Small Cap Value</td>
+      <td>1.0</td>
+      <td>1.0</td>
+      <td>1.0</td>
+    </tr>
+    <tr>
+      <th>5</th>
+      <td>International Developed</td>
+      <td>1.0</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+    </tr>
+    <tr>
+      <th>6</th>
+      <td>Emerging Markets</td>
+      <td>1.0</td>
+      <td>0.0</td>
+      <td>0.0</td>
+    </tr>
+    <tr>
+      <th>7</th>
+      <td>Global Fixed Income</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+    </tr>
+    <tr>
+      <th>8</th>
+      <td>U.S. Governments</td>
+      <td>-1.0</td>
+      <td>1.0</td>
+      <td>1.0</td>
+    </tr>
+    <tr>
+      <th>9</th>
+      <td>U.S. Mortgages</td>
+      <td>-1.0</td>
+      <td>1.0</td>
+      <td>1.0</td>
+    </tr>
+    <tr>
+      <th>10</th>
+      <td>U.S. Corporates</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+    </tr>
+    <tr>
+      <th>11</th>
+      <td>International Fixed Income</td>
+      <td>-1.0</td>
+      <td>0.0</td>
+      <td>0.0</td>
+    </tr>
+    <tr>
+      <th>12</th>
+      <td>High Yield</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+    </tr>
+    <tr>
+      <th>13</th>
+      <td>U.S. Investment-grade</td>
+      <td>-1.0</td>
+      <td>0.0</td>
+      <td>0.0</td>
+    </tr>
+    <tr>
+      <th>14</th>
+      <td>Tax Exempt U.S. High Yield Tax Exempt</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+      <td>-1.0</td>
+    </tr>
+  </tbody>
+</table>
+</div></div></div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Calculate accuracy for markitdown and docling</span>
+<span class="n">markitdown_accuracy</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_comparison</span><span class="p">[</span><span class="s1">&#39;markitdown&#39;</span><span class="p">]</span> <span class="o">==</span> <span class="n">df_comparison</span><span class="p">[</span><span class="s1">&#39;true_value&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
+<span class="n">docling_accuracy</span> <span class="o">=</span> <span class="p">(</span><span class="n">df_comparison</span><span class="p">[</span><span class="s1">&#39;docling&#39;</span><span class="p">]</span> <span class="o">==</span> <span class="n">df_comparison</span><span class="p">[</span><span class="s1">&#39;true_value&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span>
+
+<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Markitdown accuracy: </span><span class="si">{</span><span class="n">markitdown_accuracy</span><span class="si">:</span><span class="s2">.2%</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Docling accuracy: </span><span class="si">{</span><span class="n">docling_accuracy</span><span class="si">:</span><span class="s2">.2%</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span> 
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Markitdown accuracy: 53.33%
+Docling accuracy: 93.33%
+</pre></div>
+</div>
+</div>
+</div>
+<p>Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling’s structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown’s unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM’s ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates.</p>
+<p>What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the <code class="docutils literal notranslate"><span class="pre">tables</span></code> attribute of the <code class="docutils literal notranslate"><span class="pre">DocumentConverter</span></code> object.</p>
+<p>By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.
+Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">time</span>
+<span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
+<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
+<span class="kn">from</span> <span class="nn">docling.document_converter</span> <span class="kn">import</span> <span class="n">DocumentConverter</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">convert_and_export_tables</span><span class="p">(</span><span class="n">file_path</span><span class="p">:</span> <span class="n">Path</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">list</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Convert document and export tables to DataFrames.</span>
+<span class="sd">    </span>
+<span class="sd">    Args:</span>
+<span class="sd">        file_path: Path to input document</span>
+<span class="sd">        </span>
+<span class="sd">    Returns:</span>
+<span class="sd">        List of pandas DataFrames containing the tables</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="n">doc_converter</span> <span class="o">=</span> <span class="n">DocumentConverter</span><span class="p">()</span>
+    <span class="n">start_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
+    
+    <span class="n">conv_res</span> <span class="o">=</span> <span class="n">doc_converter</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">file_path</span><span class="p">)</span>
+    
+    <span class="n">tables</span> <span class="o">=</span> <span class="p">[]</span>
+    <span class="c1"># Export tables</span>
+    <span class="k">for</span> <span class="n">table</span> <span class="ow">in</span> <span class="n">conv_res</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">tables</span><span class="p">:</span>
+        <span class="n">table_df</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span> <span class="o">=</span> <span class="n">table</span><span class="o">.</span><span class="n">export_to_dataframe</span><span class="p">()</span>
+        <span class="n">tables</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">table_df</span><span class="p">)</span>
+
+    <span class="n">end_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start_time</span>
+    <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Document converted in </span><span class="si">{</span><span class="n">end_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> seconds.&quot;</span><span class="p">)</span>
+    
+    <span class="k">return</span> <span class="n">tables</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Convert and export tables</span>
+<span class="n">tables</span> <span class="o">=</span> <span class="n">convert_and_export_tables</span><span class="p">(</span><span class="n">Path</span><span class="p">(</span><span class="n">FORECAST_FILE_PATH</span><span class="p">))</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="nb">len</span><span class="p">(</span><span class="n">tables</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>7
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">display</span><span class="p">(</span><span class="n">tables</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_html"><div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th></th>
+      <th>Total Return in USD (%).Current</th>
+      <th>Total Return in USD (%).WTD</th>
+      <th>Total Return in USD (%).MTD</th>
+      <th>Total Return in USD (%).YTD</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>DJIA</td>
+      <td>43,828.06</td>
+      <td>-1.8</td>
+      <td>-2.3</td>
+      <td>18.4</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>NASDAQ</td>
+      <td>19,926.72</td>
+      <td>0.4</td>
+      <td>3.7</td>
+      <td>33.7</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>S&amp;P 500</td>
+      <td>6,051.09</td>
+      <td>-0.6</td>
+      <td>0.4</td>
+      <td>28.6</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>S&amp;P 400 Mid Cap</td>
+      <td>3,277.20</td>
+      <td>-1.6</td>
+      <td>-2.6</td>
+      <td>19.5</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>Russell 2000</td>
+      <td>2,346.90</td>
+      <td>-2.5</td>
+      <td>-3.5</td>
+      <td>17.3</td>
+    </tr>
+    <tr>
+      <th>5</th>
+      <td>MSCI World</td>
+      <td>3,817.24</td>
+      <td>-1.0</td>
+      <td>0.2</td>
+      <td>22.1</td>
+    </tr>
+    <tr>
+      <th>6</th>
+      <td>MSCI EAFE</td>
+      <td>2,319.05</td>
+      <td>-1.5</td>
+      <td>0.2</td>
+      <td>6.4</td>
+    </tr>
+    <tr>
+      <th>7</th>
+      <td>MSCI Emerging Markets</td>
+      <td>1,107.01</td>
+      <td>0.3</td>
+      <td>2.7</td>
+      <td>10.6</td>
+    </tr>
+  </tbody>
+</table>
+</div></div></div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">display</span><span class="p">(</span><span class="n">tables</span><span class="p">[</span><span class="mi">1</span><span class="p">])</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_html"><div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th></th>
+      <th>Total Return in USD (%).Current</th>
+      <th>Total Return in USD (%).WTD</th>
+      <th>Total Return in USD (%).MTD</th>
+      <th>Total Return in USD (%).YTD</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Corporate &amp; Government</td>
+      <td>4.66</td>
+      <td>-1.34</td>
+      <td>-0.92</td>
+      <td>1.94</td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>Agencies</td>
+      <td>4.54</td>
+      <td>-0.58</td>
+      <td>-0.31</td>
+      <td>3.35</td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>Municipals</td>
+      <td>3.55</td>
+      <td>-0.87</td>
+      <td>-0.54</td>
+      <td>1.99</td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>U.S. Investment Grade Credit</td>
+      <td>4.79</td>
+      <td>-1.38</td>
+      <td>-0.93</td>
+      <td>1.97</td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>International</td>
+      <td>5.17</td>
+      <td>-1.40</td>
+      <td>-0.90</td>
+      <td>3.20</td>
+    </tr>
+    <tr>
+      <th>5</th>
+      <td>High Yield</td>
+      <td>7.19</td>
+      <td>-0.22</td>
+      <td>0.20</td>
+      <td>8.87</td>
+    </tr>
+    <tr>
+      <th>6</th>
+      <td>90 Day Yield</td>
+      <td>4.32</td>
+      <td>4.39</td>
+      <td>4.49</td>
+      <td>5.33</td>
+    </tr>
+    <tr>
+      <th>7</th>
+      <td>2 Year Yield</td>
+      <td>4.24</td>
+      <td>4.10</td>
+      <td>4.15</td>
+      <td>4.25</td>
+    </tr>
+    <tr>
+      <th>8</th>
+      <td>10 Year Yield</td>
+      <td>4.40</td>
+      <td>4.15</td>
+      <td>4.17</td>
+      <td>3.88</td>
+    </tr>
+    <tr>
+      <th>9</th>
+      <td>30 Year Yield</td>
+      <td>4.60</td>
+      <td>4.34</td>
+      <td>4.36</td>
+      <td>4.03</td>
+    </tr>
+  </tbody>
+</table>
+</div></div></div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">display</span><span class="p">(</span><span class="n">tables</span><span class="p">[</span><span class="mi">6</span><span class="p">])</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<div class="output text_html"><div>
+<style scoped>
+    .dataframe tbody tr th:only-of-type {
+        vertical-align: middle;
+    }
+
+    .dataframe tbody tr th {
+        vertical-align: top;
+    }
+
+    .dataframe thead th {
+        text-align: right;
+    }
+</style>
+<table border="1" class="dataframe">
+  <thead>
+    <tr style="text-align: right;">
+      <th></th>
+      <th>Sector</th>
+      <th>CIO View.</th>
+      <th>CIO View.Underweight</th>
+      <th>CIO View.Neutral</th>
+      <th>CIO View.</th>
+      <th>CIO View.Overweight</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>0</th>
+      <td>Utilities</td>
+      <td>slight over weight green   </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>1</th>
+      <td>Financials</td>
+      <td>slight over weight green   </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>2</th>
+      <td>Healthcare</td>
+      <td>slight over weight green   </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>3</th>
+      <td>Consumer  Discretionary</td>
+      <td>Slight over weight green  </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>4</th>
+      <td>Information  Technology</td>
+      <td>Neutral yellow  </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>5</th>
+      <td>Communication  Services</td>
+      <td>Neutral yellow  </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>6</th>
+      <td>Industrials</td>
+      <td>Neutral yellow  </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>7</th>
+      <td>Real Estate</td>
+      <td>Neutral yellow  </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>8</th>
+      <td>Energy</td>
+      <td>slight underweight orange  </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>9</th>
+      <td>Materials</td>
+      <td>slight underweight orange  </td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <th>10</th>
+      <td>Consumer  Staples</td>
+      <td>underweight red</td>
+      <td></td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+  </tbody>
+</table>
+</div></div></div>
+</div>
+<p>Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model to its constructor.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">md_llm</span> <span class="o">=</span> <span class="n">MarkItDown</span><span class="p">(</span><span class="n">llm_client</span><span class="o">=</span><span class="n">client</span><span class="p">,</span> <span class="n">llm_model</span><span class="o">=</span><span class="s2">&quot;gpt-4o-mini&quot;</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">result</span> <span class="o">=</span> <span class="n">md_llm</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="s2">&quot;../data/input/forecast.png&quot;</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>Here’s the description we obtain from the image of our input document.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">display</span><span class="p">(</span><span class="n">Markdown</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">text_content</span><span class="p">))</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<h1 class="rubric" id="description">Description:</h1>
+<p><strong>Markets in Review: Economic Forecasts and Asset Class Weightings (as of 12/13/2024)</strong></p>
+<p>This detailed market overview presents key performance metrics and economic forecasts as of December 13, 2024.</p>
+<p><strong>Equities Overview:</strong></p>
+<ul class="simple">
+<li><p><strong>Total Returns:</strong> Highlights returns for major indices such as the DJIA (18.4% YTD), NASDAQ (33.7% YTD), and S&amp;P 500 (28.6% YTD), showcasing strong performance across the board.</p></li>
+<li><p><strong>Forecasts:</strong> Economic indicators reveal a projected real global GDP growth of 3.1%, with inflation rates expected to stabilize around 2.2% in 2025. Unemployment rates are anticipated to remain low at 4.4%.</p></li>
+</ul>
+<p><strong>Fixed Income:</strong></p>
+<ul class="simple">
+<li><p>Focuses on various segments, including Corporate &amp; Government bonds, which offer an annualized return of 4.66% and indicate shifting trends in interest rates over 2-Year (4.25%) and 10-Year (4.03%) bonds.</p></li>
+</ul>
+<p><strong>Commodities &amp; Currencies:</strong></p>
+<ul class="simple">
+<li><p>Commodities such as crude oil and gold show varied performance, with oil increasing by 4.8% and gold prices sitting at $2,648.23 per ounce.</p></li>
+<li><p>Currency metrics highlight the Euro and USD trends over the past year.</p></li>
+</ul>
+<p><strong>S&amp;P Sector Returns:</strong></p>
+<ul class="simple">
+<li><p>A quick reference for sector performance indicates a significant 2.5% return in Communication Services, while other sectors like Consumer Staples and Materials display minor fluctuations.</p></li>
+</ul>
+<p><strong>CIO Asset Class Weightings:</strong></p>
+<ul class="simple">
+<li><p>Emphasizes strategic asset allocation recommendations which are crucial for an investor’s portfolio. Underweight positions in U.S. Small Cap Growth and International Developed contrast with overweight positions in certain sectors such as Utilities and Financials, signaling tactical shifts based on ongoing economic assessments.</p></li>
+</ul>
+<p><strong>Note:</strong> This summary is sourced from BofA Global Research and aims to provide a comprehensive view of current market conditions and forecasts to assist investors in making informed decisions.</p>
+</div>
+</div>
+<hr class="docutils" />
+<p>Overall, the description is somewhat accurate but contains a few inaccuracies including:</p>
+<ul class="simple">
+<li><p>For the sector weightings, the description states there are “underweight positions in U.S. Small Cap Growth” but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).</p></li>
+<li><p>The description mentions “overweight positions in certain sectors such as Utilities and Financials” but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.</p></li>
+<li><p>For fixed income, the description cites a “10-Year (4.03%)” yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.</p></li>
+</ul>
+<p>Arguably, the description’s inaccuracies could be a consequence of the underlying LLM model’s inability to process the image. Further research is needed to determine if this is the case.</p>
+</section>
+</section>
+<section id="retrieval-augmented-generation">
+<h2><a class="toc-backref" href="#id213" role="doc-backlink"><span class="section-number">5.3. </span>Retrieval-Augmented Generation</a><a class="headerlink" href="#retrieval-augmented-generation" title="Permalink to this heading">¶</a></h2>
+<p>RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks <span id="id8">[<a class="reference internal" href="#id99" title="Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, and Douwe Kiela. Retrieval-augmented generation for knowledge-intensive nlp tasks. 2021. URL: https://arxiv.org/abs/2005.11401, arXiv:2005.11401.">Lewis <em>et al.</em>, 2021</a>]</span>.</p>
+<p>RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs <span id="id9">[<a class="reference internal" href="#id148" title="Shiyu Ni, Keping Bi, Jiafeng Guo, and Xueqi Cheng. When do LLMs need retrieval augmentation? mitigating LLMs' overconfidence helps retrieval augmentation. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, Findings of the Association for Computational Linguistics: ACL 2024, 11375–11388. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.findings-acl.675, doi:10.18653/v1/2024.findings-acl.675.">Ni <em>et al.</em>, 2024</a>, <a class="reference internal" href="#id142" title="Yujia Zhou, Zheng Liu, Jiajie Jin, Jian-Yun Nie, and Zhicheng Dou. Metacognitive retrieval-augmented large language models. In Proceedings of the ACM Web Conference 2024, WWW '24, 1453-1463. New York, NY, USA, 2024. Association for Computing Machinery. URL: https://doi.org/10.1145/3589334.3645481, doi:10.1145/3589334.3645481.">Zhou <em>et al.</em>, 2024</a>]</span>.</p>
+</section>
+<section id="case-studies">
+<h2><a class="toc-backref" href="#id214" role="doc-backlink"><span class="section-number">5.4. </span>Case Studies</a><a class="headerlink" href="#case-studies" title="Permalink to this heading">¶</a></h2>
+<p>This section presents three case studies that demonstrate practical solutions to common LLM limitations:</p>
+<p>First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.</p>
+<p>Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.</p>
+<p>Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality.</p>
+<section id="case-study-i-content-chunking-with-contextual-linking">
+<h3><a class="toc-backref" href="#id215" role="doc-backlink"><span class="section-number">5.4.1. </span>Case Study I: Content Chunking with Contextual Linking</a><a class="headerlink" href="#case-study-i-content-chunking-with-contextual-linking" title="Permalink to this heading">¶</a></h3>
+<p>Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:</p>
+<ol class="arabic simple">
+<li><p>The LLM’s inability to process long inputs to do context-size limits</p></li>
+<li><p>The LLM’s inability to generate long-form content due to the <code class="docutils literal notranslate"><span class="pre">max_output_tokens</span></code> limitation.</p></li>
+<li><p>The LLM’s inability to maintain coherence and context when generating responses per chunks</p></li>
+</ol>
+<p>Here, we exemplify this technique by following these steps:</p>
+<ol class="arabic simple">
+<li><p><strong>Chunking the Content</strong>: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.</p></li>
+<li><p><strong>Maintaining Context</strong>: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.</p></li>
+<li><p><strong>Generating Linked Prompts</strong>: For each chunk, a prompt is generated that includes the chunk’s content and its context. This prompt is then used to generate the output for that chunk.</p></li>
+<li><p><strong>Combining the Outputs</strong>: The outputs of all chunks are combined to form the final long-form content.</p></li>
+</ol>
+<p>Let’s examine an example implementation of this technique.</p>
+<section id="generating-long-form-content">
+<h4><a class="toc-backref" href="#id216" role="doc-backlink"><span class="section-number">5.4.1.1. </span>Generating long-form content</a><a class="headerlink" href="#generating-long-form-content" title="Permalink to this heading">¶</a></h4>
+<ul class="simple">
+<li><p>Goal: Generate a long-form report analyzing a company’s financial statement.</p></li>
+<li><p>Input: A company’s 10K SEC filing.</p></li>
+</ul>
+<figure class="align-center" id="content-chunking-with-contextual-linking">
+<a class="reference internal image-reference" href="../_images/diagram1.png"><img alt="Content Chunking with Contextual Linking" src="../_images/diagram1.png" style="width: 819.0px; height: 1725.0px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.6 </span><span class="caption-text">Content Chunking with Contextual Linking Schematic Representation.</span><a class="headerlink" href="#content-chunking-with-contextual-linking" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>The diagram in <a class="reference internal" href="#content-chunking-with-contextual-linking"><span class="std std-numref">Fig. 5.6</span></a> illustrates the process we will follow for handling long-form content generation with Large Language Models through “Content Chunking with Contextual Linking.” It shows how input content is first split into manageable chunks using a chunking function (e.g. <code class="docutils literal notranslate"><span class="pre">CharacterTextSplitter</span></code> with <code class="docutils literal notranslate"><span class="pre">tiktoken</span></code> tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.</p>
+<p><strong>Step 1: Chunking the Content</strong></p>
+<p>There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:</p>
+<ul>
+<li><p><strong>Fixed-size Chunking</strong>: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.</p></li>
+<li><p><strong>Content-aware Chunking</strong>: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:</p>
+<ul class="simple">
+<li><p><strong>Sentence Splitting</strong>: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.</p></li>
+<li><p><strong>Recursive Chunking</strong>: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.</p></li>
+<li><p><strong>Semantic Chunking</strong>: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.</p></li>
+</ul>
+<p>Here, we will utilize <code class="docutils literal notranslate"><span class="pre">langchain</span></code> for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters <span id="id10">[<a class="reference internal" href="#id51" title="LangChain. Text splitters - langchain documentation. https://python.langchain.com/docs/how_to/#text-splitters, 2024. Accessed: 12/07/2024.">LangChain, 2024</a>]</span> such as JSON-, Markdown- and HTML-based or split by token. We will use the <code class="docutils literal notranslate"><span class="pre">CharacterTextSplitter</span></code> with <code class="docutils literal notranslate"><span class="pre">tiktoken</span></code> as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.</p>
+</li>
+</ul>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_chunks</span><span class="p">(</span><span class="n">text</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">chunk_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">chunk_overlap</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">list</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Split input text into chunks of specified size with specified overlap.</span>
+
+<span class="sd">    Args:</span>
+<span class="sd">        text (str): The input text to be chunked.</span>
+<span class="sd">        chunk_size (int): The maximum size of each chunk in tokens.</span>
+<span class="sd">        chunk_overlap (int): The number of tokens to overlap between chunks.</span>
+
+<span class="sd">    Returns:</span>
+<span class="sd">        list: A list of text chunks.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="kn">from</span> <span class="nn">langchain_text_splitters</span> <span class="kn">import</span> <span class="n">CharacterTextSplitter</span>
+
+    <span class="n">text_splitter</span> <span class="o">=</span> <span class="n">CharacterTextSplitter</span><span class="o">.</span><span class="n">from_tiktoken_encoder</span><span class="p">(</span><span class="n">chunk_size</span><span class="o">=</span><span class="n">chunk_size</span><span class="p">,</span> <span class="n">chunk_overlap</span><span class="o">=</span><span class="n">chunk_overlap</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">text_splitter</span><span class="o">.</span><span class="n">split_text</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p><strong>Step 2: Writing the Base Prompt Template</strong></p>
+<p>We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:</p>
+<ul class="simple">
+<li><p><code class="docutils literal notranslate"><span class="pre">role</span></code>: Defines the role or persona the model should assume.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">context</span></code>: Provides the background information or context for the task.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">instruction</span></code>: Specifies the task or action the model needs to perform.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">input_text</span></code>: Contains the actual text input that the model will process.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">requirements</span></code>: Lists any specific requirements or constraints for the output.</p></li>
+</ul>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">langchain_core.prompts</span> <span class="kn">import</span> <span class="n">PromptTemplate</span>
+<span class="k">def</span> <span class="nf">get_base_prompt_template</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+    
+    <span class="n">base_prompt</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;</span>
+<span class="s2">    ROLE: </span><span class="si">{role}</span>
+<span class="s2">    CONTEXT: </span><span class="si">{context}</span>
+<span class="s2">    INSTRUCTION: </span><span class="si">{instruction}</span>
+<span class="s2">    INPUT: </span><span class="si">{input}</span>
+<span class="s2">    REQUIREMENTS: </span><span class="si">{requirements}</span>
+<span class="s2">    &quot;&quot;&quot;</span>
+    
+    <span class="n">prompt</span> <span class="o">=</span> <span class="n">PromptTemplate</span><span class="o">.</span><span class="n">from_template</span><span class="p">(</span><span class="n">base_prompt</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">prompt</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>We will write a simple function that returns an <code class="docutils literal notranslate"><span class="pre">LLMChain</span></code> which is a simple <code class="docutils literal notranslate"><span class="pre">langchain</span></code> construct that allows you to chain together a combination of prompt templates, language models and output parsers.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">langchain_core.output_parsers</span> <span class="kn">import</span> <span class="n">StrOutputParser</span>
+<span class="kn">from</span> <span class="nn">langchain_community.chat_models</span> <span class="kn">import</span> <span class="n">ChatLiteLLM</span>
+
+<span class="k">def</span> <span class="nf">get_llm_chain</span><span class="p">(</span><span class="n">prompt_template</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">model_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">temperature</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mi">0</span><span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Returns an LLMChain instance using langchain.</span>
+
+<span class="sd">    Args:</span>
+<span class="sd">        prompt_template (str): The prompt template to use.</span>
+<span class="sd">        model_name (str): The name of the model to use.</span>
+<span class="sd">        temperature (float): The temperature setting for the model.</span>
+
+<span class="sd">    Returns:</span>
+<span class="sd">        llm_chain: An instance of the LLMChain.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    
+    <span class="kn">from</span> <span class="nn">dotenv</span> <span class="kn">import</span> <span class="n">load_dotenv</span>
+    <span class="kn">import</span> <span class="nn">os</span>
+
+    <span class="c1"># Load environment variables from .env file</span>
+    <span class="n">load_dotenv</span><span class="p">()</span>
+    
+    <span class="n">api_key_label</span> <span class="o">=</span> <span class="n">model_name</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot;/&quot;</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">upper</span><span class="p">()</span> <span class="o">+</span> <span class="s2">&quot;_API_KEY&quot;</span>
+    <span class="n">llm</span> <span class="o">=</span> <span class="n">ChatLiteLLM</span><span class="p">(</span>
+        <span class="n">model</span><span class="o">=</span><span class="n">model_name</span><span class="p">,</span>
+        <span class="n">temperature</span><span class="o">=</span><span class="n">temperature</span><span class="p">,</span>
+        <span class="n">api_key</span><span class="o">=</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="n">api_key_label</span><span class="p">],</span>
+    <span class="p">)</span>
+    <span class="n">llm_chain</span> <span class="o">=</span> <span class="n">prompt_template</span> <span class="o">|</span> <span class="n">llm</span> <span class="o">|</span> <span class="n">StrOutputParser</span><span class="p">()</span>
+    <span class="k">return</span> <span class="n">llm_chain</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p><strong>Step 3: Constructing Dynamic Prompt Parameters</strong></p>
+<p>Now, we will write a function (<code class="docutils literal notranslate"><span class="pre">get_dynamic_prompt_template</span></code>) that constructs prompt parameters dynamically for each chunk.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Dict</span>
+<span class="k">def</span> <span class="nf">get_dynamic_prompt_params</span><span class="p">(</span><span class="n">prompt_params</span><span class="p">:</span> <span class="n">Dict</span><span class="p">,</span> 
+                            <span class="n">part_idx</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> 
+                            <span class="n">total_parts</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
+                            <span class="n">chat_context</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
+                            <span class="n">chunk</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.</span>
+<span class="sd">    </span>
+<span class="sd">    Args:</span>
+<span class="sd">        prompt_params (Dict): Original prompt parameters</span>
+<span class="sd">        part_idx (int): Index of current conversation part</span>
+<span class="sd">        total_parts (int): Total number of conversation parts</span>
+<span class="sd">        chat_context (str): Chat context from previous parts</span>
+<span class="sd">        chunk (str): Current chunk of text to be processed</span>
+<span class="sd">    Returns:</span>
+<span class="sd">        str: Dynamically constructed prompt template with part-specific params</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+    <span class="n">dynamic_prompt_params</span> <span class="o">=</span> <span class="n">prompt_params</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
+    <span class="c1"># saves the chat context from previous parts</span>
+    <span class="n">dynamic_prompt_params</span><span class="p">[</span><span class="s2">&quot;context&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">chat_context</span>
+    <span class="c1"># saves the current chunk of text to be processed as input</span>
+    <span class="n">dynamic_prompt_params</span><span class="p">[</span><span class="s2">&quot;input&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">chunk</span>
+    
+    <span class="c1"># Add part-specific instructions</span>
+    <span class="k">if</span> <span class="n">part_idx</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> <span class="c1"># Introduction part</span>
+        <span class="n">dynamic_prompt_params</span><span class="p">[</span><span class="s2">&quot;instruction&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;&quot;&quot;</span>
+<span class="s2">        You are generating the Introduction part of a long report.</span>
+<span class="s2">        Don&#39;t cover any topics yet, just define the scope of the report.</span>
+<span class="s2">        &quot;&quot;&quot;</span>
+    <span class="k">elif</span> <span class="n">part_idx</span> <span class="o">==</span> <span class="n">total_parts</span> <span class="o">-</span> <span class="mi">1</span><span class="p">:</span> <span class="c1"># Conclusion part</span>
+        <span class="n">dynamic_prompt_params</span><span class="p">[</span><span class="s2">&quot;instruction&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;&quot;&quot;</span>
+<span class="s2">        You are generating the last part of a long report. </span>
+<span class="s2">        For this part, first discuss the below INPUT. Second, write a &quot;Conclusion&quot; section summarizing the main points discussed given in CONTEXT.</span>
+<span class="s2">        &quot;&quot;&quot;</span>
+    <span class="k">else</span><span class="p">:</span> <span class="c1"># Main analysis part</span>
+        <span class="n">dynamic_prompt_params</span><span class="p">[</span><span class="s2">&quot;instruction&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;&quot;&quot;</span>
+<span class="s2">        You are generating part </span><span class="si">{</span><span class="n">part_idx</span><span class="o">+</span><span class="mi">1</span><span class="si">}</span><span class="s2"> of </span><span class="si">{</span><span class="n">total_parts</span><span class="si">}</span><span class="s2"> parts of a long report.</span>
+<span class="s2">        For this part, analyze the below INPUT.</span>
+<span class="s2">        Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.</span>
+<span class="s2">        &quot;&quot;&quot;</span>
+    
+    <span class="k">return</span> <span class="n">dynamic_prompt_params</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p><strong>Step 4: Generating the Report</strong></p>
+<p>Finally, we will write a function that generates the actual report by calling the <code class="docutils literal notranslate"><span class="pre">LLMChain</span></code> with the dynamically updated prompt parameters for each chunk and concatenating the results at the end.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">generate_report</span><span class="p">(</span><span class="n">input_content</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">llm_model_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> 
+                    <span class="n">role</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">requirements</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
+                    <span class="n">chunk_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">chunk_overlap</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+    <span class="c1"># stores the parts of the report, each generated by an individual LLM call</span>
+    <span class="n">report_parts</span> <span class="o">=</span> <span class="p">[]</span> 
+    <span class="c1"># split the input content into chunks</span>
+    <span class="n">chunks</span> <span class="o">=</span> <span class="n">get_chunks</span><span class="p">(</span><span class="n">input_content</span><span class="p">,</span> <span class="n">chunk_size</span><span class="p">,</span> <span class="n">chunk_overlap</span><span class="p">)</span>
+    <span class="c1"># initialize the chat context with the input content</span>
+    <span class="n">chat_context</span> <span class="o">=</span> <span class="n">input_content</span>
+    <span class="c1"># number of parts to be generated</span>
+    <span class="n">num_parts</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">chunks</span><span class="p">)</span>
+
+    <span class="n">prompt_params</span> <span class="o">=</span> <span class="p">{</span>
+        <span class="s2">&quot;role&quot;</span><span class="p">:</span> <span class="n">role</span><span class="p">,</span> <span class="c1"># user-provided</span>
+        <span class="s2">&quot;context&quot;</span><span class="p">:</span> <span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="c1"># dinamically updated per part</span>
+        <span class="s2">&quot;instruction&quot;</span><span class="p">:</span> <span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="c1"># dynamically updated per part</span>
+        <span class="s2">&quot;input&quot;</span><span class="p">:</span> <span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="c1"># dynamically updated per part</span>
+        <span class="s2">&quot;requirements&quot;</span><span class="p">:</span> <span class="n">requirements</span> <span class="c1">#user-priovided</span>
+    <span class="p">}</span>
+
+    <span class="c1"># get the LLMChain with the base prompt template</span>
+    <span class="n">llm_chain</span> <span class="o">=</span> <span class="n">get_llm_chain</span><span class="p">(</span><span class="n">get_base_prompt_template</span><span class="p">(),</span> 
+                                 <span class="n">llm_model_name</span><span class="p">)</span>
+
+    <span class="c1"># dynamically update prompt_params per part</span>
+    <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Generating </span><span class="si">{</span><span class="n">num_parts</span><span class="si">}</span><span class="s2"> report parts&quot;</span><span class="p">)</span>
+    <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">chunk</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">chunks</span><span class="p">):</span>
+        <span class="n">dynamic_prompt_params</span> <span class="o">=</span> <span class="n">get_dynamic_prompt_params</span><span class="p">(</span>
+            <span class="n">prompt_params</span><span class="p">,</span>
+            <span class="n">part_idx</span><span class="o">=</span><span class="n">i</span><span class="p">,</span>
+            <span class="n">total_parts</span><span class="o">=</span><span class="n">num_parts</span><span class="p">,</span>
+            <span class="n">chat_context</span><span class="o">=</span><span class="n">chat_context</span><span class="p">,</span>
+            <span class="n">chunk</span><span class="o">=</span><span class="n">chunk</span>
+        <span class="p">)</span>
+        
+        <span class="c1"># invoke the LLMChain with the dynamically updated prompt parameters</span>
+        <span class="n">response</span> <span class="o">=</span> <span class="n">llm_chain</span><span class="o">.</span><span class="n">invoke</span><span class="p">(</span><span class="n">dynamic_prompt_params</span><span class="p">)</span>
+
+        <span class="c1"># update the chat context with the cummulative response</span>
+        <span class="k">if</span> <span class="n">i</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+            <span class="n">chat_context</span> <span class="o">=</span> <span class="n">response</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="n">chat_context</span> <span class="o">=</span> <span class="n">chat_context</span> <span class="o">+</span> <span class="n">response</span>
+            
+        <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Generated part </span><span class="si">{</span><span class="n">i</span><span class="o">+</span><span class="mi">1</span><span class="si">}</span><span class="s2">/</span><span class="si">{</span><span class="n">num_parts</span><span class="si">}</span><span class="s2">.&quot;</span><span class="p">)</span>
+        <span class="n">report_parts</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">response</span><span class="p">)</span>
+
+    <span class="n">report</span> <span class="o">=</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">report_parts</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">report</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p><strong>Example Usage</strong></p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Load the text from sample 10K SEC filing</span>
+<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;../data/apple.txt&#39;</span><span class="p">,</span> <span class="s1">&#39;r&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">file</span><span class="p">:</span>
+    <span class="n">text</span> <span class="o">=</span> <span class="n">file</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Define the chunk and chunk overlap size</span>
+<span class="n">MAX_CHUNK_SIZE</span> <span class="o">=</span> <span class="mi">10000</span>
+<span class="n">MAX_CHUNK_OVERLAP</span> <span class="o">=</span> <span class="mi">0</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">report</span> <span class="o">=</span> <span class="n">generate_report</span><span class="p">(</span><span class="n">text</span><span class="p">,</span> <span class="n">llm_model_name</span><span class="o">=</span><span class="s2">&quot;gemini/gemini-1.5-flash-latest&quot;</span><span class="p">,</span> 
+                           <span class="n">role</span><span class="o">=</span><span class="s2">&quot;Financial Analyst&quot;</span><span class="p">,</span> 
+                           <span class="n">requirements</span><span class="o">=</span><span class="s2">&quot;The report should be in a readable, structured format, easy to understand and follow. Focus on finding risk factors and market moving insights.&quot;</span><span class="p">,</span>
+                           <span class="n">chunk_size</span><span class="o">=</span><span class="n">MAX_CHUNK_SIZE</span><span class="p">,</span> 
+                           <span class="n">chunk_overlap</span><span class="o">=</span><span class="n">MAX_CHUNK_OVERLAP</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Save the generated report to a local file</span>
+<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;data/apple_report.txt&#39;</span><span class="p">,</span> <span class="s1">&#39;w&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">file</span><span class="p">:</span>
+    <span class="n">file</span><span class="o">.</span><span class="n">write</span><span class="p">(</span><span class="n">report</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="c1"># Read and display the generated report</span>
+<span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="s1">&#39;../data/apple_report.txt&#39;</span><span class="p">,</span> <span class="s1">&#39;r&#39;</span><span class="p">)</span> <span class="k">as</span> <span class="n">file</span><span class="p">:</span>
+    <span class="n">report_content</span> <span class="o">=</span> <span class="n">file</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
+    
+<span class="kn">from</span> <span class="nn">IPython.display</span> <span class="kn">import</span> <span class="n">Markdown</span>
+
+<span class="c1"># Display first and last 10% of the report content</span>
+<span class="n">report_lines</span> <span class="o">=</span> <span class="n">report_content</span><span class="o">.</span><span class="n">splitlines</span><span class="p">()</span>
+<span class="n">total_lines</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">report_lines</span><span class="p">)</span>
+<span class="n">quarter_lines</span> <span class="o">=</span> <span class="n">total_lines</span> <span class="o">//</span> <span class="mi">10</span>
+
+<span class="n">top_portion</span> <span class="o">=</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">report_lines</span><span class="p">[:</span><span class="n">quarter_lines</span><span class="p">])</span>
+<span class="n">bottom_portion</span> <span class="o">=</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">report_lines</span><span class="p">[</span><span class="o">-</span><span class="n">quarter_lines</span><span class="p">:])</span>
+
+<span class="n">display</span><span class="p">(</span><span class="n">Markdown</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">top_portion</span><span class="si">}</span><span class="se">\n\n</span><span class="s2"> (...) </span><span class="se">\n\n</span><span class="s2"> </span><span class="si">{</span><span class="n">bottom_portion</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">))</span>
+</pre></div>
+</div>
+</div>
+<div class="cell_output docutils container">
+<p><strong>Introduction</strong></p>
+<p>This report provides a comprehensive analysis of Apple Inc.’s financial performance and position for the fiscal year ended September 28, 2024, as disclosed in its Form 10-K filing with the United States Securities and Exchange Commission.  The analysis will focus on identifying key risk factors impacting Apple’s business, evaluating its financial health, and uncovering market-moving insights derived from the provided data.  The report will delve into Apple’s various segments, product lines, and services, examining their performance and contributions to overall financial results.  Specific attention will be paid to identifying trends, potential challenges, and opportunities for future growth.  The analysis will also consider the broader macroeconomic environment and its influence on Apple’s operations and financial outlook.  Finally, the report will incorporate relevant information from Apple’s definitive proxy statement for its 2025 annual meeting of shareholders, as incorporated by reference in the Form 10-K.</p>
+<p><strong>PART 2: Key Risk Factors and Market-Moving Insights</strong></p>
+<p>This section analyzes key risk factors disclosed in Apple Inc.’s 2024 Form 10-K, focusing on their potential impact on financial performance and identifying potential market-moving insights.  The analysis is structured around the major risk categories identified in the filing.</p>
+<p><strong>2.1 Dependence on Third-Party Developers:</strong></p>
+<p>Apple’s success is heavily reliant on the continued support and innovation of third-party software developers.  The Form 10-K highlights several critical aspects of this dependence:</p>
+<ul class="simple">
+<li><p><strong>Market Share Vulnerability:</strong> Apple’s relatively smaller market share in smartphones, personal computers, and tablets compared to competitors (Android, Windows, gaming consoles) could discourage developers from prioritizing Apple’s platform, leading to fewer high-quality apps and potentially impacting customer purchasing decisions.  This is a significant risk, especially given the rapid pace of technological change.  A decline in app availability or quality could negatively impact sales and market share.  <strong>Market-moving insight:</strong>  Monitoring developer activity and app quality across competing platforms is crucial for assessing this risk.  Any significant shift in developer focus away from iOS could be a negative market signal.</p></li>
+<li><p><strong>App Store Dynamics:</strong> While Apple allows developers to retain most App Store revenue, its commission structure and recent changes (e.g., complying with the Digital Markets Act (DMA) in the EU) introduce uncertainty.  Changes to the App Store’s policies or fee structures could materially affect Apple’s revenue and profitability.  <strong>Market-moving insight:</strong>  Closely monitoring regulatory developments (especially concerning the DMA) and their impact on App Store revenue is essential.  Any significant changes to Apple’s App Store policies or revenue streams could trigger market reactions.</p></li>
+<li><p><strong>Content Acquisition and Creation:</strong> Apple’s reliance on third-party digital content providers for its services introduces risks related to licensing agreements, competition, and pricing.  The cost of producing its own digital content is also increasing due to competition for talent and subscribers.  Failure to secure or create appealing content could negatively impact user engagement and revenue.  <strong>Market-moving insight:</strong>  Analyzing the success of Apple’s original content initiatives and the renewal rates of third-party content agreements will provide insights into this risk.</p></li>
+</ul>
+<p><strong>2.2 Operational Risks:</strong></p>
+<p>(…)</p>
+<p>The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&amp;D) and other corporate expenses significantly impact overall profitability.  While increased R&amp;D is generally positive, it reduces short-term profits.  The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple’s business in the U.S. and China.  <strong>Market-moving insight:</strong>  Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&amp;D spending should be closely monitored for their potential impact on Apple’s financial performance and investor sentiment.</p>
+<p><strong>5.4 Auditor’s Report and Internal Controls:</strong></p>
+<p>The auditor’s report expresses an unqualified opinion on Apple’s financial statements and internal control over financial reporting.  However, it identifies uncertain tax positions as a critical audit matter.  The significant amount of unrecognized tax benefits ($22.0 billion) and the complexity involved in evaluating these positions highlight a substantial risk.  Management’s assessment of these positions involves significant judgment and relies on interpretations of complex tax laws.  Apple’s management also asserts that its disclosure controls and procedures are effective.  <strong>Market-moving insight:</strong>  Any changes in tax laws, unfavorable rulings on uncertain tax positions, or weaknesses in internal controls could materially affect Apple’s financial results and investor confidence.</p>
+<p><strong>Conclusion</strong></p>
+<p>This report provides a comprehensive analysis of Apple Inc.’s financial performance and position for fiscal year 2024.  While Apple maintains a strong financial position with substantial cash reserves and a robust capital return program, several key risk factors could significantly impact its future performance.  These risks include:</p>
+<ul class="simple">
+<li><p><strong>Dependence on third-party developers:</strong>  A shift in developer focus away from iOS or changes to the App Store’s policies could negatively impact Apple’s revenue and profitability.</p></li>
+<li><p><strong>Operational risks:</strong>  Employee retention challenges, reseller dependence, and cybersecurity threats pose significant operational risks.</p></li>
+<li><p><strong>Legal and regulatory risks:</strong>  Ongoing antitrust litigation, the Digital Markets Act (DMA) compliance, and data privacy regulations introduce substantial legal and regulatory uncertainties.</p></li>
+<li><p><strong>Financial risks:</strong>  Volatility in sales and profit margins, foreign exchange rate fluctuations, credit risk, and tax risks could impact Apple’s financial performance.</p></li>
+<li><p><strong>Supply chain concentration:</strong>  Apple’s reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, and dependence on single or limited sources for certain custom components, exposes the company to significant supply chain risks.</p></li>
+<li><p><strong>Uncertain tax positions:</strong>  The significant amount of unrecognized tax benefits represents a substantial uncertainty that could materially affect Apple’s financial results.</p></li>
+</ul>
+<p>Despite these risks, Apple’s strong liquidity position, continued growth in its Services segment, and robust capital return program provide a degree of resilience.  However, investors and analysts should closely monitor the market-moving insights identified throughout this report, including developer activity, regulatory developments, regional economic conditions, supply chain stability, and the resolution of uncertain tax positions, to assess their potential impact on Apple’s future performance and valuation.  The significant short-term obligations, while manageable given Apple’s cash position, highlight the need for continued financial discipline and effective risk management.  A deeper, more granular analysis of the financial statements and notes is recommended for a more complete assessment.</p>
+</div>
+</div>
+</section>
+<hr class="docutils" />
+<section id="discussion">
+<h4><a class="toc-backref" href="#id217" role="doc-backlink"><span class="section-number">5.4.1.2. </span>Discussion</a><a class="headerlink" href="#discussion" title="Permalink to this heading">¶</a></h4>
+<p>Results from the generated report present a few interesting aspects:</p>
+<ul class="simple">
+<li><p><strong>Coherence</strong>: The generated report demonstrates an apparent level of coherence. The sections are logically structured, and the flow of information is smooth. Each part of the report builds upon the previous sections, providing a comprehensive analysis of Apple Inc.’s financial performance and key risk factors. The use of headings and subheadings helps in maintaining clarity and organization throughout the document.</p></li>
+<li><p><strong>Adherence to Instructions</strong>: The LLM followed the provided instructions effectively. The report is in a readable, structured format, and it focuses on identifying risk factors and market-moving insights as requested. The analysis is detailed and covers various aspects of Apple’s financial performance, including revenue segmentation, profitability, liquidity, and capital resources. The inclusion of market-moving insights adds value to the report, aligning with the specified requirements.</p></li>
+</ul>
+<p>Despite the seemingly good quality of the results, there are some limitations to consider:</p>
+<ul class="simple">
+<li><p><strong>Depth of Analysis</strong>: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert’s evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.</p></li>
+<li><p><strong>Chunking Strategy</strong>: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model’s token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be “structured” chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.</p></li>
+</ul>
+<p>Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic’s Contextual Retrieval <span id="id11">[<a class="reference internal" href="#id144" title="Anthropic. Introducing contextual retrieval. 09 2024. URL: https://www.anthropic.com/news/contextual-retrieval.">Anthropic, 2024</a>]</span>. The approach, as shown in <a class="reference internal" href="#anth-contextual"><span class="std std-numref">Fig. 5.7</span></a>, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.</p>
+<figure class="align-center" id="anth-contextual">
+<a class="reference internal image-reference" href="../_images/anth_contextual.png"><img alt="Anthropic Contextual Linking" src="../_images/anth_contextual.png" style="width: 545.5px; height: 359.0px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.7 </span><span class="caption-text">Anthropic Contextual Linking <span id="id12">[<a class="reference internal" href="#id144" title="Anthropic. Introducing contextual retrieval. 09 2024. URL: https://www.anthropic.com/news/contextual-retrieval.">Anthropic, 2024</a>]</span>.</span><a class="headerlink" href="#anth-contextual" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+</section>
+</section>
+<section id="case-study-ii-github-rag">
+<h3><a class="toc-backref" href="#id218" role="doc-backlink"><span class="section-number">5.4.2. </span>Case Study II: Github RAG</a><a class="headerlink" href="#case-study-ii-github-rag" title="Permalink to this heading">¶</a></h3>
+</section>
+<section id="case-study-iii-quiz-generation-with-citations">
+<h3><a class="toc-backref" href="#id219" role="doc-backlink"><span class="section-number">5.4.3. </span>Case Study III: Quiz Generation with Citations</a><a class="headerlink" href="#case-study-iii-quiz-generation-with-citations" title="Permalink to this heading">¶</a></h3>
+<p>In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.</p>
+<section id="use-case">
+<h4><a class="toc-backref" href="#id220" role="doc-backlink"><span class="section-number">5.4.3.1. </span>Use Case</a><a class="headerlink" href="#use-case" title="Permalink to this heading">¶</a></h4>
+<p>Let’s assume you are a Harvard student enrolled in GOV 1039 “The Birth of Modern Democracy” (see <a class="reference internal" href="#harvard-class"><span class="std std-numref">Fig. 5.8</span></a>), you face a daunting reading list for next Tuesday’s class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.</p>
+<figure class="align-center" id="harvard-class">
+<a class="reference internal image-reference" href="../_images/harvard.png"><img alt="Harvard Class" src="../_images/harvard.png" style="width: 691.0px; height: 435.0px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.8 </span><span class="caption-text">Harvard’s Democratic Theory Class</span><a class="headerlink" href="#harvard-class" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>Instead of trudging through these dense historical texts sequentially, we would like to:</p>
+<ul class="simple">
+<li><p>Extract key insights and connections between these documents, conversationally.</p></li>
+<li><p>Engage with the material through a quiz format.</p></li>
+<li><p>Add citations to help with verifying answers.</p></li>
+</ul>
+</section>
+<section id="implementation">
+<h4><a class="toc-backref" href="#id221" role="doc-backlink"><span class="section-number">5.4.3.2. </span>Implementation</a><a class="headerlink" href="#implementation" title="Permalink to this heading">¶</a></h4>
+<p>The full implementation is available at Book’s <a class="reference external" href="https://github.com/souzatharsis/tamingLLMs/tamingllms/notebooks/src/gemini_duo.py">Github repository</a>. Here, we will cover the most relevant parts of the implementation.</p>
+<p><strong>Client Class</strong></p>
+<p>First, we will define the <code class="docutils literal notranslate"><span class="pre">Client</span></code> class which will provide the key interface users will interact with. It has the following summarized interface:</p>
+<ul class="simple">
+<li><p>Initialization:</p>
+<ul>
+<li><p><code class="docutils literal notranslate"><span class="pre">__init__(knowledge_base:</span> <span class="pre">List[str]</span> <span class="pre">=</span> <span class="pre">[])</span></code>: Initialize with optional list of URLs as knowledge base</p></li>
+</ul>
+</li>
+<li><p>Core Methods:</p>
+<ul>
+<li><p><code class="docutils literal notranslate"><span class="pre">add_knowledge_base(urls:</span> <span class="pre">List[str])</span> <span class="pre">-&gt;</span> <span class="pre">None</span></code>: Add URLs to the knowledge base</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">add(urls:</span> <span class="pre">List[str])</span> <span class="pre">-&gt;</span> <span class="pre">None</span></code>: Extract content from URLs and add to conversation input</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">msg(msg:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">&quot;&quot;,</span> <span class="pre">add_citations:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False)</span> <span class="pre">-&gt;</span> <span class="pre">str</span></code>: Enables users to send messages to the client</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">quiz(add_citations:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True,</span> <span class="pre">num_questions:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">10)</span> <span class="pre">-&gt;</span> <span class="pre">str</span></code>: Generate a quiz based on full input memory</p></li>
+</ul>
+</li>
+<li><p>Key Attributes:</p>
+<ul>
+<li><p><code class="docutils literal notranslate"><span class="pre">knowledge_base</span></code>: List of URLs providing foundation knowledge</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">input</span></code>: Current input being studied (short-term memory)</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">input_memory</span></code>: Cumulative input + knowledge base (long-term memory)</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">response</span></code>: Latest response from LLM</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">response_memory</span></code>: Cumulative responses (long-term memory)</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">urls_memory</span></code>: Cumulative list of processed URLs</p></li>
+</ul>
+</li>
+</ul>
+<p><strong>Corpus-in-Context Prompting</strong></p>
+<p>The <code class="docutils literal notranslate"><span class="pre">add()</span></code> method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the “Corpus-in-Context” (CIC) Prompting <span id="id13">[<a class="reference internal" href="#id101" title="Jinhyuk Lee, Anthony Chen, Zhuyun Dai, Dheeru Dua, Devendra Singh Sachan, Michael Boratko, Yi Luan, Sébastien M. R. Arnold, Vincent Perot, Siddharth Dalmia, Hexiang Hu, Xudong Lin, Panupong Pasupat, Aida Amini, Jeremy R. Cole, Sebastian Riedel, Iftekhar Naim, Ming-Wei Chang, and Kelvin Guu. Can long-context language models subsume retrieval, rag, sql, and more? 2024. URL: https://arxiv.org/abs/2406.13121, arXiv:2406.13121.">Lee <em>et al.</em>, 2024</a>]</span>.</p>
+<p><a class="reference internal" href="#cic"><span class="std std-numref">Fig. 5.9</span></a> shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.</p>
+<figure class="align-center" id="cic">
+<a class="reference internal image-reference" href="../_images/cic.png"><img alt="CIC Format" src="../_images/cic.png" style="width: 830.5px; height: 361.5px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.9 </span><span class="caption-text">Example of Corpus-in-Context Prompting for retrieval.</span><a class="headerlink" href="#cic" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+<p>CiC prompting leverages LLM’s capacity to follow instructions by carefully annotating the corpus with document IDs. It benefits from a strong, capable models to retrieve over large corpora provided in context.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>    <span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">urls</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">urls</span> <span class="o">=</span> <span class="n">urls</span>
+
+        <span class="c1"># Add new content to input following CIC format to enable citations</span>
+        <span class="k">for</span> <span class="n">url</span> <span class="ow">in</span> <span class="n">urls</span><span class="p">:</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">urls_memory</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">url</span><span class="p">)</span>
+            <span class="n">content</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">extractor</span><span class="o">.</span><span class="n">convert</span><span class="p">(</span><span class="n">url</span><span class="p">)</span><span class="o">.</span><span class="n">text_content</span>
+            <span class="n">formatted_content</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;ID: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">reference_id</span><span class="si">}</span><span class="s2"> | </span><span class="si">{</span><span class="n">content</span><span class="si">}</span><span class="s2"> | END ID: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">reference_id</span><span class="si">}</span><span class="s2">&quot;</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">input</span> <span class="o">+=</span> <span class="n">formatted_content</span> <span class="o">+</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span> 
+            <span class="bp">self</span><span class="o">.</span><span class="n">reference_id</span> <span class="o">+=</span> <span class="mi">1</span>
+        
+        <span class="c1"># Update memory</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">input_memory</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_memory</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">input</span>
+</pre></div>
+</div>
+<p>The method <code class="docutils literal notranslate"><span class="pre">add_knowledge_base()</span></code> is a simple wrapper around the <code class="docutils literal notranslate"><span class="pre">add()</span></code> method. It is used to add URLs to the knowledge base, which are later cached by the LLM model as we will see later.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>    <span class="k">def</span> <span class="nf">add_knowledge_base</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">urls</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">urls</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>Later, when the user sends a message to the client, the <code class="docutils literal notranslate"><span class="pre">msg()</span></code> method is used to generate a response  while enabling citations. <code class="docutils literal notranslate"><span class="pre">self.content_generator</span></code> is an instance of our LLM model, which we will go through next.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>    <span class="k">def</span> <span class="nf">msg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">msg</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="n">add_citations</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+        <span class="k">if</span> <span class="n">add_citations</span><span class="p">:</span>
+            <span class="n">msg</span> <span class="o">=</span> <span class="n">msg</span> <span class="o">+</span> <span class="s2">&quot;</span><span class="se">\n\n</span><span class="s2"> For key statements, add Input ID to the response.&quot;</span>
+
+        <span class="bp">self</span><span class="o">.</span><span class="n">response</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">content_generator</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span>
+            <span class="n">input_content</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">input</span><span class="p">,</span>
+            <span class="n">user_instructions</span><span class="o">=</span><span class="n">msg</span>
+        <span class="p">)</span>
+
+        <span class="bp">self</span><span class="o">.</span><span class="n">response_memory</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">response_memory</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">response</span><span class="o">.</span><span class="n">text</span>
+
+        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">response</span><span class="o">.</span><span class="n">text</span>
+</pre></div>
+</div>
+<p><strong>Prompt Caching</strong></p>
+<p>LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.</p>
+<p>In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our <code class="docutils literal notranslate"><span class="pre">Client</span></code> class is composed of a <code class="docutils literal notranslate"><span class="pre">LLMBackend</span></code> class that takes the <code class="docutils literal notranslate"><span class="pre">input_memory</span></code> containing the entire knowledge base and any additional user added content.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="bp">self</span><span class="o">.</span><span class="n">llm</span> <span class="o">=</span> <span class="n">LLMBackend</span><span class="p">(</span><span class="nb">input</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">input_memory</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>In our <code class="docutils literal notranslate"><span class="pre">LLMBackend</span></code> Class, we leverage prompt caching on input tokens and uses them for subsequent requests.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span> <span class="nc">LLMBackend</span><span class="p">:</span>
+    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">input</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">cache_ttl</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">60</span><span class="p">):</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">cache</span> <span class="o">=</span> <span class="n">caching</span><span class="o">.</span><span class="n">CachedContent</span><span class="o">.</span><span class="n">create</span><span class="p">(</span>
+            <span class="n">model</span><span class="o">=</span><span class="n">model_name</span><span class="p">,</span>
+            <span class="n">display_name</span><span class="o">=</span><span class="s1">&#39;due_knowledge_base&#39;</span><span class="p">,</span> <span class="c1"># used to identify the cache</span>
+            <span class="n">system_instruction</span><span class="o">=</span><span class="p">(</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">compose_prompt</span><span class="p">(</span><span class="nb">input</span><span class="p">,</span> <span class="n">conversation_config</span><span class="p">)</span>
+        <span class="p">),</span>
+        <span class="n">ttl</span><span class="o">=</span><span class="n">datetime</span><span class="o">.</span><span class="n">timedelta</span><span class="p">(</span><span class="n">minutes</span><span class="o">=</span><span class="n">cache_ttl</span><span class="p">),</span>
+    <span class="p">)</span>
+
+    <span class="bp">self</span><span class="o">.</span><span class="n">model</span> <span class="o">=</span> <span class="n">genai</span><span class="o">.</span><span class="n">GenerativeModel</span><span class="o">.</span><span class="n">from_cached_content</span><span class="p">(</span><span class="n">cached_content</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cache</span><span class="p">)</span>
+</pre></div>
+</div>
+<p><strong>Quiz Generation</strong></p>
+<p>Coming back to our <code class="docutils literal notranslate"><span class="pre">Client</span></code> class, we implement the <code class="docutils literal notranslate"><span class="pre">quiz()</span></code> method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.</p>
+<p>The <code class="docutils literal notranslate"><span class="pre">quiz()</span></code> method returns a <code class="docutils literal notranslate"><span class="pre">Quiz</span></code> instance which behind the scenes caches input tokens. The user later can invoke its <code class="docutils literal notranslate"><span class="pre">generate()</span></code> method to generate a quiz passing the user instructions in <code class="docutils literal notranslate"><span class="pre">msg</span></code> parameter, as we will see later.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>    <span class="k">def</span> <span class="nf">quiz</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">add_citations</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">num_questions</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
+<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">        Returns a quiz instance based on full input memory.</span>
+<span class="sd">        &quot;&quot;&quot;</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">quiz_instance</span> <span class="o">=</span> <span class="n">Quiz</span><span class="p">(</span>
+                         <span class="nb">input</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">input_memory</span><span class="p">,</span>
+                         <span class="n">add_citations</span><span class="o">=</span><span class="n">add_citations</span><span class="p">,</span>
+                         <span class="n">num_questions</span><span class="o">=</span><span class="n">num_questions</span><span class="p">)</span>
+        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">quiz_instance</span>
+</pre></div>
+</div>
+<p>We write a simple prompt template for quiz generation:</p>
+<blockquote>
+<div><p>ROLE:</p>
+<ul class="simple">
+<li><p>You are a Harvard Professor providing a quiz.
+INSTRUCTIONS:</p></li>
+<li><p>Generate a quiz with {num_questions} questions based on the input.</p></li>
+<li><p>The quiz should be multi-choice.</p></li>
+<li><p>Answers should be provided at the end of the quiz.</p></li>
+<li><p>Questions should have broad coverage of the input including multiple Input IDs.</p></li>
+<li><p>Level of difficulty is advanced/hard.</p></li>
+<li></li>
+</ul>
+<p>STRUCTURE:</p>
+<ul class="simple">
+<li><p>Sequence of questions and alternatives.</p></li>
+<li><p>At the end provide the correct answers.</p></li>
+</ul>
+</div></blockquote>
+<p>where, <code class="docutils literal notranslate"><span class="pre">{citations}</span></code> instructs the model to add CiC citations to the response if user requests it.</p>
+</section>
+<section id="example-usage">
+<h4><a class="toc-backref" href="#id222" role="doc-backlink"><span class="section-number">5.4.3.3. </span>Example Usage</a><a class="headerlink" href="#example-usage" title="Permalink to this heading">¶</a></h4>
+<p><strong>Dataset</strong></p>
+<p>First, we will define our knowledge base.</p>
+<ul class="simple">
+<li><p>Harvard Class: <a class="reference external" href="https://scholar.harvard.edu/files/dlcammack/files/gov_1039_syllabus.pdf">GOV 1039 Syllabus</a></p></li>
+<li><p>Class / Topic: “Rights”</p></li>
+<li><p>Reading List:</p>
+<ul>
+<li><p>ID 1. The Declaration of Independence of the United States of America</p></li>
+<li><p>ID 2. The United States Bill of Rights</p></li>
+<li><p>ID 3. John F. Kennedy’s Inaugural Address</p></li>
+<li><p>ID 4. Lincoln’s Gettysburg Address</p></li>
+<li><p>ID 5. The United States Constitution</p></li>
+<li><p>ID 6. Give Me Liberty or Give Me Death</p></li>
+<li><p>ID 7. The Mayflower Compact</p></li>
+<li><p>ID 8. Abraham Lincoln’s Second Inaugural Address</p></li>
+<li><p>ID 9. Abraham Lincoln’s First Inaugural Address</p></li>
+</ul>
+</li>
+</ul>
+<p>We will take advantage of Project Gutenberg’s to create our knowledge base.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">kb</span> <span class="o">=</span> <span class="p">[</span><span class="sa">f</span><span class="s2">&quot;https://www.gutenberg.org/cache/epub/</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s2">/pg</span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s2">.txt&quot;</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="mi">9</span><span class="p">)]</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>We will import our module <code class="docutils literal notranslate"><span class="pre">gemini_duo</span></code> as <code class="docutils literal notranslate"><span class="pre">genai_duo</span></code> and initialize the <code class="docutils literal notranslate"><span class="pre">Client</span></code> class with our knowledge base.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">gemini_duo</span> <span class="k">as</span> <span class="nn">genai_duo</span>
+<span class="kn">from</span> <span class="nn">IPython.display</span> <span class="kn">import</span> <span class="n">Markdown</span><span class="p">,</span> <span class="n">display</span>
+</pre></div>
+</div>
+</div>
+</div>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">duo</span> <span class="o">=</span> <span class="n">genai_duo</span><span class="o">.</span><span class="n">Client</span><span class="p">(</span><span class="n">knowledge_base</span><span class="o">=</span><span class="n">kb</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>At this point, we converted each book into markdown using MarkitDown and cached the content in our LLM model. We can access how many tokens we have cached in our LLM model by looking at the <code class="docutils literal notranslate"><span class="pre">usage_metadata</span></code> attribute of the Gemini’s model response. At this point, we have cached at total of 38470 tokens.</p>
+<p>Now, we can add references to our knowledge base at anytime by calling the <code class="docutils literal notranslate"><span class="pre">add()</span></code> method. We add the following references:</p>
+<ol class="arabic simple">
+<li><p>The Magna Carta</p></li>
+<li><p>William Shap McKechnie on Magna Carta book</p></li>
+</ol>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">study_references</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;https://www.gutenberg.org/cache/epub/10000/pg10000.txt&quot;</span><span class="p">,</span> <span class="s2">&quot;https://www.gutenberg.org/cache/epub/65363/pg65363.txt&quot;</span><span class="p">]</span>
+
+<span class="n">duo</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">study_references</span><span class="p">)</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p>Now we can instantiate a <code class="docutils literal notranslate"><span class="pre">Quiz</span></code> object and generate a quiz based on the full input memory.</p>
+<div class="cell docutils container">
+<div class="cell_input docutils container">
+<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">quiz</span> <span class="o">=</span> <span class="n">duo</span><span class="o">.</span><span class="n">quiz</span><span class="p">(</span><span class="n">add_citations</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="n">display</span><span class="p">(</span><span class="n">Markdown</span><span class="p">(</span><span class="n">quiz</span><span class="o">.</span><span class="n">generate</span><span class="p">()))</span>
+</pre></div>
+</div>
+</div>
+</div>
+<p><a class="reference internal" href="#quiz"><span class="std std-numref">Fig. 5.10</span></a> shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.</p>
+<figure class="align-center" id="quiz">
+<a class="reference internal image-reference" href="../_images/quiz.png"><img alt="Quiz with Citations" src="../_images/quiz.png" style="width: 891.0px; height: 772.0px;" /></a>
+<figcaption>
+<p><span class="caption-number">Fig. 5.10 </span><span class="caption-text">Sample Quiz with Citations.</span><a class="headerlink" href="#quiz" title="Permalink to this image">¶</a></p>
+</figcaption>
+</figure>
+</section>
+<section id="id14">
+<h4><a class="toc-backref" href="#id223" role="doc-backlink"><span class="section-number">5.4.3.4. </span>Discussion</a><a class="headerlink" href="#id14" title="Permalink to this heading">¶</a></h4>
+<p>The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.</p>
+<p>However, several limitations emerged during this process:</p>
+<ol class="arabic simple">
+<li><p>Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.</p></li>
+<li><p>Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.</p></li>
+<li><p>Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.</p></li>
+</ol>
+<p>While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.</p>
+</section>
+</section>
+</section>
+<section id="conclusion">
+<h2><a class="toc-backref" href="#id224" role="doc-backlink"><span class="section-number">5.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
+  <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
+  <span class="n">title</span> <span class="o">=</span> <span class="p">{</span><span class="n">Taming</span> <span class="n">LLMs</span><span class="p">:</span> <span class="n">A</span> <span class="n">Practical</span> <span class="n">Guide</span> <span class="n">to</span> <span class="n">LLM</span> <span class="n">Pitfalls</span> <span class="k">with</span> <span class="n">Open</span> <span class="n">Source</span> <span class="n">Software</span><span class="p">},</span>
+  <span class="n">year</span> <span class="o">=</span> <span class="p">{</span><span class="mi">2024</span><span class="p">},</span>
+  <span class="n">chapter</span> <span class="o">=</span> <span class="p">{</span><span class="n">Managing</span> <span class="n">Input</span> <span class="n">Data</span><span class="p">},</span>
+  <span class="n">journal</span> <span class="o">=</span> <span class="p">{</span><span class="n">GitHub</span> <span class="n">repository</span><span class="p">},</span>
+  <span class="n">url</span> <span class="o">=</span> <span class="p">{</span><span class="n">https</span><span class="p">:</span><span class="o">//</span><span class="n">github</span><span class="o">.</span><span class="n">com</span><span class="o">/</span><span class="n">souzatharsis</span><span class="o">/</span><span class="n">tamingLLMs</span><span class="p">)</span>
+<span class="p">}</span>
+</pre></div>
+</div>
+</section>
+<section id="references">
+<h2><a class="toc-backref" href="#id225" role="doc-backlink"><span class="section-number">5.6. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<div class="docutils container" id="id15">
+<div class="citation" id="id146" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id3">AWP+24</a><span class="fn-bracket">]</span></span>
+<p>Alfonso Amayuelas, Kyle Wong, Liangming Pan, Wenhu Chen, and William Yang Wang. Knowledge of knowledge: exploring known-unknowns uncertainty with large language models. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, <em>Findings of the Association for Computational Linguistics: ACL 2024</em>, 6416–6432. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: <a class="reference external" href="https://aclanthology.org/2024.findings-acl.383">https://aclanthology.org/2024.findings-acl.383</a>, <a class="reference external" href="https://doi.org/10.18653/v1/2024.findings-acl.383">doi:10.18653/v1/2024.findings-acl.383</a>.</p>
+</div>
+<div class="citation" id="id147" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id4">KSR24</a><span class="fn-bracket">]</span></span>
+<p>Suhas Kotha, Jacob Mitchell Springer, and Aditi Raghunathan. Understanding catastrophic forgetting in language models via implicit inference. In <em>The Twelfth International Conference on Learning Representations</em>. 2024. URL: <a class="reference external" href="https://openreview.net/forum?id=VrHiF2hsrm">https://openreview.net/forum?id=VrHiF2hsrm</a>.</p>
+</div>
+<div class="citation" id="id101" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span>LCD+24<span class="fn-bracket">]</span></span>
+<span class="backrefs">(<a role="doc-backlink" href="#id1">1</a>,<a role="doc-backlink" href="#id13">2</a>)</span>
+<p>Jinhyuk Lee, Anthony Chen, Zhuyun Dai, Dheeru Dua, Devendra Singh Sachan, Michael Boratko, Yi Luan, Sébastien M. R. Arnold, Vincent Perot, Siddharth Dalmia, Hexiang Hu, Xudong Lin, Panupong Pasupat, Aida Amini, Jeremy R. Cole, Sebastian Riedel, Iftekhar Naim, Ming-Wei Chang, and Kelvin Guu. Can long-context language models subsume retrieval, rag, sql, and more? 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2406.13121">https://arxiv.org/abs/2406.13121</a>, <a class="reference external" href="https://arxiv.org/abs/2406.13121">arXiv:2406.13121</a>.</p>
+</div>
+<div class="citation" id="id99" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id8">LPP+21</a><span class="fn-bracket">]</span></span>
+<p>Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, and Douwe Kiela. Retrieval-augmented generation for knowledge-intensive nlp tasks. 2021. URL: <a class="reference external" href="https://arxiv.org/abs/2005.11401">https://arxiv.org/abs/2005.11401</a>, <a class="reference external" href="https://arxiv.org/abs/2005.11401">arXiv:2005.11401</a>.</p>
+</div>
+<div class="citation" id="id148" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id9">NBGC24</a><span class="fn-bracket">]</span></span>
+<p>Shiyu Ni, Keping Bi, Jiafeng Guo, and Xueqi Cheng. When do LLMs need retrieval augmentation? mitigating LLMs' overconfidence helps retrieval augmentation. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, <em>Findings of the Association for Computational Linguistics: ACL 2024</em>, 11375–11388. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: <a class="reference external" href="https://aclanthology.org/2024.findings-acl.675">https://aclanthology.org/2024.findings-acl.675</a>, <a class="reference external" href="https://doi.org/10.18653/v1/2024.findings-acl.675">doi:10.18653/v1/2024.findings-acl.675</a>.</p>
+</div>
+<div class="citation" id="id143" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id2">TDW+24</a><span class="fn-bracket">]</span></span>
+<p>Jiejun Tan, Zhicheng Dou, Wen Wang, Mang Wang, Weipeng Chen, and Ji-Rong Wen. Htmlrag: html is better than plain text for modeling retrieved knowledge in rag systems. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2411.02959">https://arxiv.org/abs/2411.02959</a>, <a class="reference external" href="https://arxiv.org/abs/2411.02959">arXiv:2411.02959</a>.</p>
+</div>
+<div class="citation" id="id142" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id9">ZLJ+24</a><span class="fn-bracket">]</span></span>
+<p>Yujia Zhou, Zheng Liu, Jiajie Jin, Jian-Yun Nie, and Zhicheng Dou. Metacognitive retrieval-augmented large language models. In <em>Proceedings of the ACM Web Conference 2024</em>, WWW '24, 1453–1463. New York, NY, USA, 2024. Association for Computing Machinery. URL: <a class="reference external" href="https://doi.org/10.1145/3589334.3645481">https://doi.org/10.1145/3589334.3645481</a>, <a class="reference external" href="https://doi.org/10.1145/3589334.3645481">doi:10.1145/3589334.3645481</a>.</p>
+</div>
+<div class="citation" id="id144" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span>Anthropic24<span class="fn-bracket">]</span></span>
+<span class="backrefs">(<a role="doc-backlink" href="#id11">1</a>,<a role="doc-backlink" href="#id12">2</a>)</span>
+<p>Anthropic. Introducing contextual retrieval. 09 2024. URL: <a class="reference external" href="https://www.anthropic.com/news/contextual-retrieval">https://www.anthropic.com/news/contextual-retrieval</a>.</p>
+</div>
+<div class="citation" id="id51" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id10">LangChain24</a><span class="fn-bracket">]</span></span>
+<p>LangChain. Text splitters - langchain documentation. <a class="reference external" href="https://python.langchain.com/docs/how_to/#text-splitters">https://python.langchain.com/docs/how_to/#text-splitters</a>, 2024. Accessed: 12/07/2024.</p>
+</div>
+<div class="citation" id="id102" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id5">MerrillLynch24</a><span class="fn-bracket">]</span></span>
+<p>Merrill Lynch. Chief investment officer capital market outlook. CIO Weekly Letter, 2024. URL: <a class="reference external" href="https://olui2.fs.ml.com/publish/content/application/pdf/gwmol/me-cio-weekly-letter.pdf">https://olui2.fs.ml.com/publish/content/application/pdf/gwmol/me-cio-weekly-letter.pdf</a>.</p>
+</div>
+</div>
+</div>
+</section>
+</section>
+
+    <script type="text/x-thebe-config">
+    {
+        requestKernel: true,
+        binderOptions: {
+            repo: "binder-examples/jupyter-stacks-datascience",
+            ref: "master",
+        },
+        codeMirrorConfig: {
+            theme: "abcdef",
+            mode: "python"
+        },
+        kernelOptions: {
+            name: "python3",
+            path: "./notebooks"
+        },
+        predefinedOutput: true
+    }
+    </script>
+    <script>kernelName = 'python3'</script>
+
+          </div>
+          <div class="page-nav">
+            <div class="inner"><ul class="page-nav">
+  <li class="prev">
+    <a href="structured_output.html"
+       title="previous chapter">← <span class="section-number">4. </span>Structured Output</a>
+  </li>
+  <li class="next">
+    <a href="safety.html"
+       title="next chapter"><span class="section-number">6. </span>Safety →</a>
+  </li>
+</ul><div class="footer" role="contentinfo">
+      &#169; Copyright Tharsis T. P. Souza, 2024.
+    <br>
+    Created using <a href="http://sphinx-doc.org/">Sphinx</a> 6.2.1 with <a href="https://github.com/schettino72/sphinx_press_theme">Press Theme</a> 0.9.1.
+</div>
+            </div>
+          </div>
+      </page>
+    </div></div>
+    
+    
+  </body>
+</html>
\ No newline at end of file
diff --git a/tamingllms/_build/html/notebooks/local.html b/tamingllms/_build/html/notebooks/local.html
index 6e29755..24d33d7 100644
--- a/tamingllms/_build/html/notebooks/local.html
+++ b/tamingllms/_build/html/notebooks/local.html
@@ -4,7 +4,7 @@
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width,initial-scale=1"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
 
-      <title>7. Local LLMs in Practice</title>
+      <title>8. Local LLMs in Practice</title>
     
           <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
           <link rel="stylesheet" href="../_static/theme.css " type="text/css" />
@@ -39,8 +39,8 @@
     
   <link rel="index" title="Index" href="../genindex.html" />
   <link rel="search" title="Search" href="../search.html" />
-  <link rel="next" title="8. The Falling Cost Paradox" href="cost.html" />
-  <link rel="prev" title="6. Preference-Based Alignment" href="alignment.html" /> 
+  <link rel="next" title="9. The Falling Cost Paradox" href="cost.html" />
+  <link rel="prev" title="7. Preference-Based Alignment" href="alignment.html" /> 
   </head>
 
   <body>
@@ -156,6 +156,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
@@ -223,18 +232,18 @@
   <ul class="breadcrumbs">
     <li><a href="../markdown/toc.html">Docs</a> &raquo;</li>
     
-    <li><span class="section-number">7. </span>Local LLMs in Practice</li>
+    <li><span class="section-number">8. </span>Local LLMs in Practice</li>
   </ul>
   
 
   <ul class="page-nav">
   <li class="prev">
     <a href="alignment.html"
-       title="previous chapter">← <span class="section-number">6. </span>Preference-Based Alignment</a>
+       title="previous chapter">← <span class="section-number">7. </span>Preference-Based Alignment</a>
   </li>
   <li class="next">
     <a href="cost.html"
-       title="next chapter"><span class="section-number">8. </span>The Falling Cost Paradox →</a>
+       title="next chapter"><span class="section-number">9. </span>The Falling Cost Paradox →</a>
   </li>
 </ul>
   
@@ -243,7 +252,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="local-llms-in-practice">
-<span id="local"></span><h1><a class="toc-backref" href="#id225" role="doc-backlink"><span class="section-number">7. </span>Local LLMs in Practice</a><a class="headerlink" href="#local-llms-in-practice" title="Permalink to this heading">¶</a></h1>
+<span id="local"></span><h1><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">8. </span>Local LLMs in Practice</a><a class="headerlink" href="#local-llms-in-practice" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>Freedom is something that dies unless it’s used.</p>
 <p class="attribution">—Hunter S. Thompson</p>
@@ -251,55 +260,55 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#local-llms-in-practice" id="id225">Local LLMs in Practice</a></p>
+<li><p><a class="reference internal" href="#local-llms-in-practice" id="id232">Local LLMs in Practice</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id226">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#choosing-your-model" id="id227">Choosing your Model</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id233">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#choosing-your-model" id="id234">Choosing your Model</a></p>
 <ul>
-<li><p><a class="reference internal" href="#task-suitability" id="id228">Task Suitability</a></p></li>
-<li><p><a class="reference internal" href="#performance-cost" id="id229">Performance &amp; Cost</a></p></li>
-<li><p><a class="reference internal" href="#licensing" id="id230">Licensing</a></p></li>
-<li><p><a class="reference internal" href="#community-support" id="id231">Community Support</a></p></li>
-<li><p><a class="reference internal" href="#customization" id="id232">Customization</a></p></li>
+<li><p><a class="reference internal" href="#task-suitability" id="id235">Task Suitability</a></p></li>
+<li><p><a class="reference internal" href="#performance-cost" id="id236">Performance &amp; Cost</a></p></li>
+<li><p><a class="reference internal" href="#licensing" id="id237">Licensing</a></p></li>
+<li><p><a class="reference internal" href="#community-support" id="id238">Community Support</a></p></li>
+<li><p><a class="reference internal" href="#customization" id="id239">Customization</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#tools-for-local-llm-deployment" id="id233">Tools for Local LLM Deployment</a></p>
+<li><p><a class="reference internal" href="#tools-for-local-llm-deployment" id="id240">Tools for Local LLM Deployment</a></p>
 <ul>
-<li><p><a class="reference internal" href="#serving-models" id="id234">Serving Models</a></p>
+<li><p><a class="reference internal" href="#serving-models" id="id241">Serving Models</a></p>
 <ul>
-<li><p><a class="reference internal" href="#llama-cpp" id="id235">LLama.cpp</a></p></li>
-<li><p><a class="reference internal" href="#llamafile" id="id236">Llamafile</a></p></li>
-<li><p><a class="reference internal" href="#ollama" id="id237">Ollama</a></p></li>
-<li><p><a class="reference internal" href="#comparison" id="id238">Comparison</a></p></li>
+<li><p><a class="reference internal" href="#llama-cpp" id="id242">LLama.cpp</a></p></li>
+<li><p><a class="reference internal" href="#llamafile" id="id243">Llamafile</a></p></li>
+<li><p><a class="reference internal" href="#ollama" id="id244">Ollama</a></p></li>
+<li><p><a class="reference internal" href="#comparison" id="id245">Comparison</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#ui" id="id239">UI</a></p>
+<li><p><a class="reference internal" href="#ui" id="id246">UI</a></p>
 <ul>
-<li><p><a class="reference internal" href="#lm-studio" id="id240">LM Studio</a></p></li>
-<li><p><a class="reference internal" href="#jan" id="id241">Jan</a></p></li>
-<li><p><a class="reference internal" href="#open-webui" id="id242">Open WebUI</a></p></li>
-<li><p><a class="reference internal" href="#id37" id="id243">Comparison</a></p></li>
+<li><p><a class="reference internal" href="#lm-studio" id="id247">LM Studio</a></p></li>
+<li><p><a class="reference internal" href="#jan" id="id248">Jan</a></p></li>
+<li><p><a class="reference internal" href="#open-webui" id="id249">Open WebUI</a></p></li>
+<li><p><a class="reference internal" href="#id37" id="id250">Comparison</a></p></li>
 </ul>
 </li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#case-study-the-effect-of-quantization-on-llm-performance" id="id244">Case Study: The Effect of Quantization on LLM Performance</a></p>
+<li><p><a class="reference internal" href="#case-study-the-effect-of-quantization-on-llm-performance" id="id251">Case Study: The Effect of Quantization on LLM Performance</a></p>
 <ul>
-<li><p><a class="reference internal" href="#prompts-dataset" id="id245">Prompts Dataset</a></p></li>
-<li><p><a class="reference internal" href="#quantization" id="id246">Quantization</a></p></li>
-<li><p><a class="reference internal" href="#benchmarking" id="id247">Benchmarking</a></p></li>
-<li><p><a class="reference internal" href="#results" id="id248">Results</a></p></li>
-<li><p><a class="reference internal" href="#takeaways" id="id249">Takeaways</a></p></li>
+<li><p><a class="reference internal" href="#prompts-dataset" id="id252">Prompts Dataset</a></p></li>
+<li><p><a class="reference internal" href="#quantization" id="id253">Quantization</a></p></li>
+<li><p><a class="reference internal" href="#benchmarking" id="id254">Benchmarking</a></p></li>
+<li><p><a class="reference internal" href="#results" id="id255">Results</a></p></li>
+<li><p><a class="reference internal" href="#takeaways" id="id256">Takeaways</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id250">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id251">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id257">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id258">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id226" role="doc-backlink"><span class="section-number">7.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">8.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>Running Open Source LLMs locally versus depending on proprietary cloud-based models represents more than just a technical choice - it’s a fundamental re-imagining of how we interact with AI technology, putting control back in the hands of users.</p>
 <p>Privacy concerns are a key driver for running LLMs locally. Individual users may want to process personal documents, photos, emails, and chat messages without sharing sensitive data with third parties. For enterprise use cases, organizations handling medical records must comply with HIPAA regulations that require data to remain on-premise. Similarly, businesses processing confidential documents and intellectual property, as well as organizations subject to GDPR and other privacy regulations, need to maintain strict control over their data processing pipeline.</p>
 <p>Cost considerations are another key driver. Organizations and individual consumers can better control expenses by matching model capabilities to their specific needs rather than paying for multiple cloud API subscriptions. For organizations with high-volume applications, this customization and control over costs becomes especially valuable compared to the often prohibitive per-request pricing of cloud solutions. For consumers, running multiple open source models locally eliminates the need to maintain separate subscriptions to access different model capabilities.</p>
@@ -309,7 +318,7 @@ <h2><a class="toc-backref" href="#id226" role="doc-backlink"><span class="sectio
 <p>We also cover key tools enabling local model serving and inference, including open source solutions such as LLama.cpp, Llamafile, and Ollama, along with user-friendly frontend interfaces that make local LLM usage more accessible. We conclude with a detailed case study, analyzing how different quantization approaches impact model performance in resource-constrained environments. This analysis reveals the critical tradeoffs between model size, inference speed, and output quality that practitioners must navigate.</p>
 </section>
 <section id="choosing-your-model">
-<span id="local-model-selection"></span><h2><a class="toc-backref" href="#id227" role="doc-backlink"><span class="section-number">7.2. </span>Choosing your Model</a><a class="headerlink" href="#choosing-your-model" title="Permalink to this heading">¶</a></h2>
+<span id="local-model-selection"></span><h2><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">8.2. </span>Choosing your Model</a><a class="headerlink" href="#choosing-your-model" title="Permalink to this heading">¶</a></h2>
 <p>The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness.</p>
 <p>It is important to observe long-term strategic considerations when choosing a model. These entails prioritization dimensions that may enable competitive advantage in the long-term, including:</p>
 <ol class="arabic simple">
@@ -321,7 +330,7 @@ <h2><a class="toc-backref" href="#id226" role="doc-backlink"><span class="sectio
 </ol>
 <p>In this section, we aim to provide a comprehensive set of considerations to selecting the right open-source LLM for your specific needs, emphasizing the importance of aligning the LLM’s capabilities with the intended task and considering resources constraints.</p>
 <section id="task-suitability">
-<h3><a class="toc-backref" href="#id228" role="doc-backlink"><span class="section-number">7.2.1. </span>Task Suitability</a><a class="headerlink" href="#task-suitability" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">8.2.1. </span>Task Suitability</a><a class="headerlink" href="#task-suitability" title="Permalink to this heading">¶</a></h3>
 <p>When evaluating an open source LLM, task suitability is a critical first consideration. A model that performs well on general benchmarks may struggle with specific domain tasks. Understanding the intended use case helps narrow down model options based on their demonstrated strengths.</p>
 <p><strong>Task Categories</strong></p>
 <p>When determining which LLM task to prioritize, carefully consider your specific use case and end-user needs. Different applications require distinct model capabilities and optimizations. Common LLM Task Categories include:</p>
@@ -335,15 +344,15 @@ <h3><a class="toc-backref" href="#id228" role="doc-backlink"><span class="sectio
 <li><p><strong>Text Classification</strong>: Categorizing and labeling text data for sentiment analysis, topic modeling, and content moderation.</p></li>
 <li><p><strong>Named Entity Recognition</strong>: Identifying and extracting specific entities from text, such as people, organizations, and locations.</p></li>
 </ul>
-<p><a class="reference internal" href="#task-number"><span class="std std-numref">Fig. 7.1</span></a> shows the number models per task category available at Hugging Face as of December 22, 2024 <span id="id2">[<a class="reference internal" href="#id46" title="Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.">Face, 2024t</a>]</span>. Text generation is by far the most popular task category.</p>
+<p><a class="reference internal" href="#task-number"><span class="std std-numref">Fig. 8.1</span></a> shows the number models per task category available at Hugging Face as of December 22, 2024 <span id="id2">[<a class="reference internal" href="#id46" title="Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.">Face, 2024t</a>]</span>. Text generation is by far the most popular task category.</p>
 <figure class="align-center" id="task-number">
 <a class="reference internal image-reference" href="../_images/task_number.png"><img alt="Task Number" src="../_images/task_number.png" style="width: 471.20000000000005px; height: 270.0px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.1 </span><span class="caption-text">Number of models per task category from Hugging Face as of December 22, 2024 <span id="id3">[<a class="reference internal" href="#id46" title="Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.">Face, 2024t</a>]</span>.</span><a class="headerlink" href="#task-number" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.1 </span><span class="caption-text">Number of models per task category from Hugging Face as of December 22, 2024 <span id="id3">[<a class="reference internal" href="#id46" title="Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.">Face, 2024t</a>]</span>.</span><a class="headerlink" href="#task-number" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p><strong>Model Types</strong></p>
-<p>Open source LLMs can be broadly categorized into three main types as far as they level of customization is concerned, each with distinct characteristics and use cases (see <a class="reference internal" href="#model-types"><span class="std std-numref">Fig. 7.2</span></a>):</p>
+<p>Open source LLMs can be broadly categorized into three main types as far as they level of customization is concerned, each with distinct characteristics and use cases (see <a class="reference internal" href="#model-types"><span class="std std-numref">Fig. 8.2</span></a>):</p>
 <ul class="simple">
 <li><p><strong>Base Models</strong>: These foundation models provide broad language understanding capabilities but typically require additional fine-tuning to excel at specific tasks. They serve as versatile starting points for customization. Examples: meta-llama/Llama-2-70b, Qwen/Qwen2.5-72B</p></li>
 <li><p><strong>Instruction-Tuned Models</strong>: Enhanced through fine-tuning on instruction-following datasets, these models excel at interpreting and executing explicit prompts and commands. They bridge the gap between general language capabilities and practical task execution. Chat models are a good example of this category. Examples: meta-llama/Llama-2-70b-chat-hf (Chat), Qwen/Qwen2.5-72B-Instruct</p></li>
@@ -352,13 +361,13 @@ <h3><a class="toc-backref" href="#id228" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="model-types">
 <a class="reference internal image-reference" href="../_images/model_types.svg"><img alt="Model Types" height="240" src="../_images/model_types.svg" width="924" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.2 </span><span class="caption-text">Model Types.</span><a class="headerlink" href="#model-types" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.2 </span><span class="caption-text">Model Types.</span><a class="headerlink" href="#model-types" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>The Llama 2 model family <span id="id4">[<a class="reference internal" href="#id169" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span> illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.</p>
-<p>Benchmark results <span id="id5">[<a class="reference internal" href="#id116" title="Meta AI. Llama-2-70b-chat-hf. Hugging Face Model, 2024c. 70 billion parameter chat model from Meta's Llama 2 family. URL: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf.">Meta AI, 2024c</a>]</span> in <a class="reference internal" href="#llama2-benchmark"><span class="std std-numref">Table 7.1</span></a> highlight the impact of model specialization. On the TruthfulQA <span id="id6">[<a class="reference internal" href="safety.html#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span> and Toxigen <span id="id7">[<a class="reference internal" href="#id121" title="Khalid Alnajjar and others. Toxigen dataset. Papers with Code Dataset, 2024. Dataset for evaluating and mitigating toxic language generation in language models. URL: https://paperswithcode.com/dataset/toxigen.">Alnajjar and others, 2024</a>]</span> benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.</p>
+<p>The Llama 2 model family <span id="id4">[<a class="reference internal" href="#id176" title="Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.">Touvron <em>et al.</em>, 2023</a>]</span> illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.</p>
+<p>Benchmark results <span id="id5">[<a class="reference internal" href="#id116" title="Meta AI. Llama-2-70b-chat-hf. Hugging Face Model, 2024c. 70 billion parameter chat model from Meta's Llama 2 family. URL: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf.">Meta AI, 2024c</a>]</span> in <a class="reference internal" href="#llama2-benchmark"><span class="std std-numref">Table 8.1</span></a> highlight the impact of model specialization. On the TruthfulQA <span id="id6">[<a class="reference internal" href="safety.html#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span> and Toxigen <span id="id7">[<a class="reference internal" href="#id121" title="Khalid Alnajjar and others. Toxigen dataset. Papers with Code Dataset, 2024. Dataset for evaluating and mitigating toxic language generation in language models. URL: https://paperswithcode.com/dataset/toxigen.">Alnajjar and others, 2024</a>]</span> benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.</p>
 <table class="docutils align-center" id="llama2-benchmark">
-<caption><span class="caption-number">Table 7.1 </span><span class="caption-text">Benchmark results for Llama 2 family of models.</span><a class="headerlink" href="#llama2-benchmark" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.1 </span><span class="caption-text">Benchmark results for Llama 2 family of models.</span><a class="headerlink" href="#llama2-benchmark" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Model</p></th>
 <th class="head"><p>Size</p></th>
@@ -411,34 +420,34 @@ <h3><a class="toc-backref" href="#id228" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="performance-cost">
-<h3><a class="toc-backref" href="#id229" role="doc-backlink"><span class="section-number">7.2.2. </span>Performance &amp; Cost</a><a class="headerlink" href="#performance-cost" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">8.2.2. </span>Performance &amp; Cost</a><a class="headerlink" href="#performance-cost" title="Permalink to this heading">¶</a></h3>
 <p>General benchmarks are useful for comparing models across different standard tasks. Open Source models are becoming more competitive with proprietary models with LLama, Qwen, DeepSeek and Mistral model families being some of the most powerful open source models available today.</p>
-<p>Qwen model family <span id="id9">[<a class="reference internal" href="#id161" title="Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.">Qwen <em>et al.</em>, 2024</a>]</span> emerged in 2024 as a model family achieving competitive performance with relatively smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in <a class="reference internal" href="#qwen-perf"><span class="std std-numref">Fig. 7.3</span></a>.</p>
+<p>Qwen model family <span id="id9">[<a class="reference internal" href="#id161" title="Qwen, :, An Yang, Baosong Yang, Beichen Zhang, Binyuan Hui, Bo Zheng, Bowen Yu, Chengyuan Li, Dayiheng Liu, Fei Huang, Haoran Wei, Huan Lin, Jian Yang, Jianhong Tu, Jianwei Zhang, Jianxin Yang, Jiaxi Yang, Jingren Zhou, Junyang Lin, Kai Dang, Keming Lu, Keqin Bao, Kexin Yang, Le Yu, Mei Li, Mingfeng Xue, Pei Zhang, Qin Zhu, Rui Men, Runji Lin, Tianhao Li, Tingyu Xia, Xingzhang Ren, Xuancheng Ren, Yang Fan, Yang Su, Yichang Zhang, Yu Wan, Yuqiong Liu, Zeyu Cui, Zhenru Zhang, and Zihan Qiu. Qwen2.5 technical report. 2024. URL: https://arxiv.org/abs/2412.15115, arXiv:2412.15115.">Qwen <em>et al.</em>, 2024</a>]</span> emerged in 2024 as a model family achieving competitive performance with relatively smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in <a class="reference internal" href="#qwen-perf"><span class="std std-numref">Fig. 8.3</span></a>.</p>
 <figure class="align-center" id="qwen-perf">
 <a class="reference internal image-reference" href="../_images/qwen_perf.png"><img alt="Qwen Performance" src="../_images/qwen_perf.png" style="width: 764.0px; height: 551.6px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.3 </span><span class="caption-text">Qwen Performance.</span><a class="headerlink" href="#qwen-perf" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.3 </span><span class="caption-text">Qwen Performance.</span><a class="headerlink" href="#qwen-perf" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p><a class="reference internal" href="#perf"><span class="std std-numref">Fig. 7.4</span></a> shows a comparison including reference proprietary models such as GPT-40, Gemini 1.5 Pro and Claude 3.5 Sonnet. Leading models vary per domain but all top ranking models are proprietary. However, open source models do show competitive performance with Qwen and LLama models leading the pack, overall.</p>
+<p><a class="reference internal" href="#perf"><span class="std std-numref">Fig. 8.4</span></a> shows a comparison including reference proprietary models such as GPT-40, Gemini 1.5 Pro and Claude 3.5 Sonnet. Leading models vary per domain but all top ranking models are proprietary. However, open source models do show competitive performance with Qwen and LLama models leading the pack, overall.</p>
 <figure class="align-center" id="perf">
 <a class="reference internal image-reference" href="../_images/perf_.png"><img alt="Performance Comparison including proprietary models." src="../_images/perf_.png" style="width: 1219.2px; height: 594.4px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.4 </span><span class="caption-text">Performance Comparison including proprietary models.</span><a class="headerlink" href="#perf" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.4 </span><span class="caption-text">Performance Comparison including proprietary models.</span><a class="headerlink" href="#perf" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>Also from China, DeepSeek-V3 <span id="id10">[<a class="reference internal" href="#id126" title="DeepSeek AI. Deepseek-v3 technical report. Technical Report, 2024. URL: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf.">AI, 2024</a>]</span> represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in <a class="reference internal" href="#deep"><span class="std std-numref">Fig. 7.5</span></a>. The model demonstrates impressive efficiency metrics (see <a class="reference internal" href="#deep2"><span class="std std-numref">Fig. 7.6</span></a>), processing input tokens at <span class="math notranslate nohighlight">\(0.27 per million and output tokens at \)</span>1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).</p>
-<p>What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model’s release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.</p>
+<p>Also from China, DeepSeek-V3 <span id="id10">[<a class="reference internal" href="#id126" title="DeepSeek. Deepseek-v3 technical report. Technical Report, 2024. URL: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf.">DeepSeek, 2024</a>]</span> represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in <a class="reference internal" href="#deep"><span class="std std-numref">Fig. 8.5</span></a>. The model demonstrates impressive cost efficiency metrics (see <a class="reference internal" href="#deep2"><span class="std std-numref">Fig. 8.6</span></a>), processing input tokens at <span class="math notranslate nohighlight">\(0.27 per million and output tokens at \)</span>1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).</p>
+<p>What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model’s release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.</p>
 <figure class="align-center" id="deep">
-<a class="reference internal image-reference" href="_static/local/deep.png"><img alt="DeepSeek-V3" src="_static/local/deep.png" /></a>
+<a class="reference internal image-reference" href="../_images/deep.jpeg"><img alt="DeepSeek-V3" src="../_images/deep.jpeg" style="width: 861.9px; height: 752.7px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.5 </span><span class="caption-text">DeepSeek-V3 Performance Comparison</span><a class="headerlink" href="#deep" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.5 </span><span class="caption-text">DeepSeek-V3 Performance Comparison</span><a class="headerlink" href="#deep" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <figure class="align-center" id="deep2">
-<a class="reference internal image-reference" href="_static/local/deep2.png"><img alt="DeepSeek-V3 Cost Benefit Analysis" src="_static/local/deep2.png" /></a>
+<a class="reference internal image-reference" href="../_images/deep2.jpeg"><img alt="DeepSeek-V3 Cost Benefit Analysis" src="../_images/deep2.jpeg" style="width: 832.0px; height: 440.7px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.6 </span><span class="caption-text">DeepSeek-V3 Cost Benefit Analysis</span><a class="headerlink" href="#deep2" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.6 </span><span class="caption-text">DeepSeek-V3 Cost Benefit Analysis</span><a class="headerlink" href="#deep2" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>While standard benchmarks provide valuable initial insights, they should be interpreted cautiously since models can be specifically optimized for these popular tests without necessarily performing well in target use cases. This necessitates developing custom evaluation frameworks with real-world validation - creating test datasets representing actual usage scenarios, defining metrics aligned with business objectives, and establishing clear baselines and improvement targets. Only through such rigorous testing can practitioners truly understand how well a model will perform in their specific context.</p>
@@ -460,26 +469,26 @@ <h3><a class="toc-backref" href="#id229" role="doc-backlink"><span class="sectio
 </ul>
 </li>
 </ul>
-<p><a class="reference internal" href="#p2"><span class="std std-numref">Fig. 7.7</span></a> shows a comparison of quality now with the added dimension of cost. Quality is measured as an average of scores from MMLU, GPQA, Math &amp; HumanEval benchmarks <span id="id11">[<a class="reference internal" href="#id59" title="Artificial Analysis. Methodology. https://artificialanalysis.ai/methodology, 2024. Accessed: December 22, 2024.">Analysis, 2024</a>]</span>. Price is a blend of Cost Per Input Token plus Input &amp; Cost Per Output Token (3:1 ratio). Reported numbers represent median across cloud providers <span id="id12">[<a class="reference internal" href="#id58" title="Artificial Analysis. Llm provider leaderboards. https://artificialanalysis.ai/leaderboards/providers, 2024. Accessed: 2024.">Analysis, 2024</a>]</span> supporting these models.</p>
+<p><a class="reference internal" href="#p2"><span class="std std-numref">Fig. 8.7</span></a> shows a comparison of quality now with the added dimension of cost. Quality is measured as an average of scores from MMLU, GPQA, Math &amp; HumanEval benchmarks <span id="id11">[<a class="reference internal" href="#id59" title="Artificial Analysis. Methodology. https://artificialanalysis.ai/methodology, 2024. Accessed: December 22, 2024.">Analysis, 2024</a>]</span>. Price is a blend of Cost Per Input Token plus Input &amp; Cost Per Output Token (3:1 ratio). Reported numbers represent median across cloud providers <span id="id12">[<a class="reference internal" href="#id58" title="Artificial Analysis. Llm provider leaderboards. https://artificialanalysis.ai/leaderboards/providers, 2024. Accessed: 2024.">Analysis, 2024</a>]</span> supporting these models.</p>
 <figure class="align-center" id="p2">
 <a class="reference internal image-reference" href="../_images/p2.png"><img alt="Performance Comparison including proprietary models." src="../_images/p2.png" style="width: 810.0px; height: 455.6px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.7 </span><span class="caption-text">Performance Comparison including proprietary models.</span><a class="headerlink" href="#p2" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.7 </span><span class="caption-text">Performance Comparison including proprietary models.</span><a class="headerlink" href="#p2" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>We observe Qwen2.5 72B and Llama 3.3 70B offer the best value among Open Source models, providing high quality at a relatively affordable price comparable to GPT-4o mini, for instance. Meanwhile Nova Lite, Nova Micro, and Llama 3.1 8B demonstrate to be budget-friendly options catering to use cases where cost is a significant factor and some compromise on quality is acceptable.</p>
-<p>From <a class="reference internal" href="#p1"><span class="std std-numref">Fig. 7.8</span></a> we have evidence that output prices are higher than input prices. This reflects the greater computational resources typically required at inference time for output compared to processing input text (e.g. tokenization, encoding). We also observe a quite significant variation in pricing across different models. Prices range from a few cents per 1M tokens (e.g., Gemini 2.0 Flash, Nova Micro, Nova Lite) to several dollars per 1M tokens (e.g., Claude 3.5 Sonnet, GPT-4o). Mistral large 2 is the most expensive model at <span class="math notranslate nohighlight">\(2/\)</span>6 per 1M input/output tokens while Nova Micro family is the cheapest among Open Source options.</p>
+<p>From <a class="reference internal" href="#p1"><span class="std std-numref">Fig. 8.8</span></a> we have evidence that output prices are higher than input prices. This reflects the greater computational resources typically required at inference time for output compared to processing input text (e.g. tokenization, encoding). We also observe a quite significant variation in pricing across different models. Prices range from a few cents per 1M tokens (e.g., Gemini 2.0 Flash, Nova Micro, Nova Lite) to several dollars per 1M tokens (e.g., Claude 3.5 Sonnet, GPT-4o). Mistral large 2 is the most expensive model at <span class="math notranslate nohighlight">\(2/\)</span>6 per 1M input/output tokens while Nova Micro family is the cheapest among Open Source options.</p>
 <figure class="align-center" id="p1">
 <a class="reference internal image-reference" href="../_images/p1.png"><img alt="Input and Output Prices" src="../_images/p1.png" style="width: 804.8000000000001px; height: 341.6px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.8 </span><span class="caption-text">Input and Output Prices Comparison.</span><a class="headerlink" href="#p1" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.8 </span><span class="caption-text">Input and Output Prices Comparison.</span><a class="headerlink" href="#p1" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>Latency figures in <a class="reference internal" href="#latency"><span class="std std-numref">Fig. 7.9</span></a> put GPT-4o (Nov ‘24) as the best performing model but Llama, Nova Micro, Phi and Mistral model families all have options with latency of half a second or better beating Gemini and Claude models considered as well as GPT-4o mini.</p>
+<p>Latency figures in <a class="reference internal" href="#latency"><span class="std std-numref">Fig. 8.9</span></a> put GPT-4o (Nov ‘24) as the best performing model but Llama, Nova Micro, Phi and Mistral model families all have options with latency of half a second or better beating Gemini and Claude models considered as well as GPT-4o mini.</p>
 <figure class="align-center" id="latency">
 <a class="reference internal image-reference" href="../_images/latency.png"><img alt="Latency Comparison" src="../_images/latency.png" style="width: 799.6px; height: 324.0px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.9 </span><span class="caption-text">Latency Comparison.</span><a class="headerlink" href="#latency" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.9 </span><span class="caption-text">Latency Comparison.</span><a class="headerlink" href="#latency" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>This analysis provides a framework for evaluating key performance considerations when selecting an LLM. While the specific figures for cost, latency, and quality change frequently (often daily) as providers update their offerings and pricing, the fundamental tradeoffs remain relevant. When evaluating model suitability for a specific use case, practitioners should carefully consider:</p>
@@ -493,9 +502,9 @@ <h3><a class="toc-backref" href="#id229" role="doc-backlink"><span class="sectio
 <p>Regular re-evaluation of these metrics is recommended as the landscape evolves rapidly. What represents the optimal choice today may change as new models are released and existing ones are updated.</p>
 </section>
 <section id="licensing">
-<h3><a class="toc-backref" href="#id230" role="doc-backlink"><span class="section-number">7.2.3. </span>Licensing</a><a class="headerlink" href="#licensing" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">8.2.3. </span>Licensing</a><a class="headerlink" href="#licensing" title="Permalink to this heading">¶</a></h3>
 <p>When evaluating open-source LLMs, it’s important to consider licensing and data usage policies. Some models may require attribution or commercial use licenses, while others may be more permissive. Additionally, ensure that the model’s training data is compatible with your intended use case and complies with relevant data protection laws.</p>
-<p>The licensing landscape for LLMs spans from highly permissive to custom and restricted usage. <a class="reference internal" href="#open-source-llms"><span class="std std-numref">Table 7.2</span></a> provides a summary of the licensing terms for some of the most popular open source LLMs. We observe two types of licenses:</p>
+<p>The licensing landscape for LLMs spans from highly permissive to custom and restricted usage. <a class="reference internal" href="#open-source-llms"><span class="std std-numref">Table 8.2</span></a> provides a summary of the licensing terms for some of the most popular open source LLMs. We observe two types of licenses:</p>
 <ul class="simple">
 <li><p><strong>Traditional Open Source</strong>:</p>
 <ul>
@@ -512,7 +521,7 @@ <h3><a class="toc-backref" href="#id230" role="doc-backlink"><span class="sectio
 </li>
 </ul>
 <table class="docutils align-center" id="open-source-llms">
-<caption><span class="caption-number">Table 7.2 </span><span class="caption-text">Open Source LLMs.</span><a class="headerlink" href="#open-source-llms" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.2 </span><span class="caption-text">Open Source LLMs.</span><a class="headerlink" href="#open-source-llms" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Creator</p></th>
 <th class="head"><p>LLM</p></th>
@@ -556,19 +565,19 @@ <h3><a class="toc-backref" href="#id230" role="doc-backlink"><span class="sectio
 <p>A significant advancement in open-source language model training data is HuggingFace’s release of the FineWeb datasets. In its first release <span id="id16">[<a class="reference internal" href="#id47" title="Guilherme Penedo, Hynek Kydlíček, Loubna Ben allal, Anton Lozhkov, Margaret Mitchell, Colin Raffel, Leandro Von Werra, and Thomas Wolf. The fineweb datasets: decanting the web for the finest text data at scale. 2024. URL: https://arxiv.org/abs/2406.17557, arXiv:2406.17557.">Penedo <em>et al.</em>, 2024</a>]</span>, FineWeb is made of a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. Additionally, data curation codebase and all of the models trained during our ablation experiments are made available. FineWeb is a fine example of an initiative that helps minimize the gap between proprietary and public knowledge.</p>
 </section>
 <section id="community-support">
-<h3><a class="toc-backref" href="#id231" role="doc-backlink"><span class="section-number">7.2.4. </span>Community Support</a><a class="headerlink" href="#community-support" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">8.2.4. </span>Community Support</a><a class="headerlink" href="#community-support" title="Permalink to this heading">¶</a></h3>
 <p>Community support plays a vital role in the open-source LLM ecosystem. Active communities contribute to model development, provide technical assistance, and share valuable resources. When evaluating open-source LLMs, the strength and engagement of the community should be a key consideration, as it directly impacts the model’s long-term viability and practical utility.</p>
 <p>The popularity of different model families reflects their community adoption. In 2024, the Qwen and Llama families have emerged as clear favorites, with Qwen2.5-1.5B-Instruct alone representing 35% of total open source models downloads in 2024.</p>
 <figure class="align-center" id="downloads">
 <a class="reference internal image-reference" href="../_images/downloads.png"><img alt="Hugging Face Downloads" src="../_images/downloads.png" style="width: 1142.3999999999999px; height: 453.59999999999997px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.10 </span><span class="caption-text">Hugging Face Model Downloads in 2024 as of December 22 of the same year <span id="id17">[<a class="reference internal" href="#id46" title="Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.">Face, 2024t</a>]</span>.</span><a class="headerlink" href="#downloads" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.10 </span><span class="caption-text">Hugging Face Model Downloads in 2024 as of December 22 of the same year <span id="id17">[<a class="reference internal" href="#id46" title="Hugging Face. Open source ai year in review 2024. https://huggingface.co/spaces/huggingface/open-source-ai-year-in-review-2024, 2024t. Accessed: 2024.">Face, 2024t</a>]</span>.</span><a class="headerlink" href="#downloads" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations <span id="id18">[<a class="reference internal" href="#id160" title="Qwen. Qwen2.5-1.5b-instruct. 2024b. Accessed: December 22, 2024. URL: https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct.">Qwen, 2024b</a>]</span>.</p>
 </section>
 <section id="customization">
-<h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="section-number">7.2.5. </span>Customization</a><a class="headerlink" href="#customization" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">8.2.5. </span>Customization</a><a class="headerlink" href="#customization" title="Permalink to this heading">¶</a></h3>
 <p>Model customization is an important consideration when selecting an open-source LLM. Adapting and fine-tuning to specific use cases can significantly impact practical utility and performance in production environments.</p>
 <p>Model providers increasingly offer streamlined fine-tuning services. For example, Mistral demonstrates an accessible approach to model customization.
 The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset <span id="id19">[<a class="reference internal" href="#id157" title="Hugging Face. Ultrachat-200k dataset. 2024u. Accessed: 2024. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k.">Face, 2024u</a>]</span>. This API design makes it easy to experiment with model customization while maintaining control over the training process.</p>
@@ -590,7 +599,7 @@ <h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 <span class="n">created_jobs</span>
 </pre></div>
 </div>
-<p>For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports <span id="id20">[<a class="reference internal" href="#id178" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span>:</p>
+<p>For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports <span id="id20">[<a class="reference internal" href="#id185" title="Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.">Face, 2024d</a>]</span>:</p>
 <ul class="simple">
 <li><p>Supervised Fine-Tuning (SFT)</p></li>
 <li><p>Reward Modeling (RM)</p></li>
@@ -598,9 +607,9 @@ <h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 <li><p>Direct Preference Optimization (DPO)</p></li>
 </ul>
 <p>In <a class="reference internal" href="alignment.html#alignment-case-study"><span class="std std-ref">Case Study: Aligning a Language Model to a Policy</span></a>, we will explore how to use TRL to fine-tune a model to align with user preferences.</p>
-<p>Successful model customization demands managing critical resources throughout the development lifecycle. This includes rigorous dataset preparation and validation to ensure high-quality training data, careful configuration of training infrastructure to optimize computational resources, systematic experimentation iterations while managing associated costs, comprehensive performance evaluation frameworks to measure improvements, and thoughtful deployment architecture planning to ensure smooth production integration. Of course, actual cost of storage and inference should be taken into consideration. <a class="reference internal" href="#mistral-costs"><span class="std std-numref">Table 7.3</span></a> shows as an example the cost of associated with fine-tuning Mistral models <span id="id21">[<a class="reference internal" href="#id45" title="Mistral AI. Mistral technology and pricing. https://mistral.ai/technology/#pricing, 2024a. Accessed: 2024.">AI, 2024a</a>]</span>.</p>
+<p>Successful model customization demands managing critical resources throughout the development lifecycle. This includes rigorous dataset preparation and validation to ensure high-quality training data, careful configuration of training infrastructure to optimize computational resources, systematic experimentation iterations while managing associated costs, comprehensive performance evaluation frameworks to measure improvements, and thoughtful deployment architecture planning to ensure smooth production integration. Of course, actual cost of storage and inference should be taken into consideration. <a class="reference internal" href="#mistral-costs"><span class="std std-numref">Table 8.3</span></a> shows as an example the cost of associated with fine-tuning Mistral models <span id="id21">[<a class="reference internal" href="#id45" title="Mistral AI. Mistral technology and pricing. https://mistral.ai/technology/#pricing, 2024a. Accessed: 2024.">AI, 2024a</a>]</span>.</p>
 <table class="docutils align-center" id="mistral-costs">
-<caption><span class="caption-number">Table 7.3 </span><span class="caption-text">Mistral fine-tuning costs as of December 22, 2024.</span><a class="headerlink" href="#mistral-costs" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.3 </span><span class="caption-text">Mistral fine-tuning costs as of December 22, 2024.</span><a class="headerlink" href="#mistral-costs" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Model</p></th>
 <th class="head"><p>One-off training (/M tokens)</p></th>
@@ -666,10 +675,10 @@ <h3><a class="toc-backref" href="#id232" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="tools-for-local-llm-deployment">
-<h2><a class="toc-backref" href="#id233" role="doc-backlink"><span class="section-number">7.3. </span>Tools for Local LLM Deployment</a><a class="headerlink" href="#tools-for-local-llm-deployment" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">8.3. </span>Tools for Local LLM Deployment</a><a class="headerlink" href="#tools-for-local-llm-deployment" title="Permalink to this heading">¶</a></h2>
 <p>Local LLM deployment tools generally fall into two categories: inference-focused tools that prioritize performance and programmability for technical users requiring production-grade deployments, and user interface (UI) tools that emphasize accessibility through graphical interfaces for non-technical users, trading some performance for ease of use and broader adoption. In the following sections we will explore some of these tools discussing their features, capabilities, and trade-offs.</p>
 <section id="serving-models">
-<h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="section-number">7.3.1. </span>Serving Models</a><a class="headerlink" href="#serving-models" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">8.3.1. </span>Serving Models</a><a class="headerlink" href="#serving-models" title="Permalink to this heading">¶</a></h3>
 <p>Serving an LLM model involves making it available for inference by setting up infrastructure to process requests and manage resources efficiently. This serving layer handles several key responsibilities, from loading model weights and managing compute resources to processing requests and optimizing performance. Let’s examine the core components of model serving:</p>
 <ol class="arabic simple">
 <li><p><strong>Model Loading and Initialization</strong></p></li>
@@ -707,11 +716,11 @@ <h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="sectio
 <li><p>Managing concurrent requests and load balancing</p></li>
 <li><p>Monitoring system resource utilization</p></li>
 </ul>
-<p>The serving layer acts as the bridge between the LLM and applications while working on top of a hardware stack as shown in <a class="reference internal" href="#local-inference"><span class="std std-numref">Fig. 7.11</span></a>. Getting this layer right is crucial for building locally-served reliable AI-powered applications, as it directly impacts the end-user experience in terms of response times, reliability, and resource efficiency.</p>
+<p>The serving layer acts as the bridge between the LLM and applications while working on top of a hardware stack as shown in <a class="reference internal" href="#local-inference"><span class="std std-numref">Fig. 8.11</span></a>. Getting this layer right is crucial for building locally-served reliable AI-powered applications, as it directly impacts the end-user experience in terms of response times, reliability, and resource efficiency.</p>
 <figure class="align-center" id="local-inference">
 <a class="reference internal image-reference" href="../_images/local_inference.svg"><img alt="Local Inference Server" height="523" src="../_images/local_inference.svg" width="626" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.11 </span><span class="caption-text">Local Inference Server.</span><a class="headerlink" href="#local-inference" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.11 </span><span class="caption-text">Local Inference Server.</span><a class="headerlink" href="#local-inference" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>Model inference can be performed on Open Source models using cloud solutions such as Groq, Cerebras Systems, and SambaNova Systems. Here, we limit our scope to Open Source solutions that enable inference on local machines which includes consumer hardware. We will cover the following:</p>
@@ -722,7 +731,7 @@ <h3><a class="toc-backref" href="#id234" role="doc-backlink"><span class="sectio
 </ul>
 <p>Let’s explore each of these options in detail.</p>
 <section id="llama-cpp">
-<h4><a class="toc-backref" href="#id235" role="doc-backlink"><span class="section-number">7.3.1.1. </span>LLama.cpp</a><a class="headerlink" href="#llama-cpp" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">8.3.1.1. </span>LLama.cpp</a><a class="headerlink" href="#llama-cpp" title="Permalink to this heading">¶</a></h4>
 <p>LLama.cpp <span id="id24">[<a class="reference internal" href="#id133" title="Georgi Gerganov and contributors. Llama.cpp. GitHub Repository, 2024a. High-performance inference of LLaMA models in pure C/C++. URL: https://github.com/ggerganov/llama.cpp.">Gerganov and contributors, 2024a</a>]</span> is an MIT-licensed open source optimized implementation of the <strong>LLama</strong> model architecture designed to run efficiently on machines with limited memory.</p>
 <p>Originally developed by Georgi Gerganov and today counting with hundreds of contributors, this C/C++ LLama version provides a simplified interface and advanced features that allow language models to run locally without overwhelming systems. With the ability to run in resource-constrained environments, LLama.cpp makes powerful language models more accessible and practical for a variety of applications.</p>
 <p>In its “Manifesto” <span id="id25">[<a class="reference internal" href="#id110" title="Georgi Gerganov and others. Quantization of llama models - discussion. GitHub Discussion, 2023. Discussion thread about quantization techniques and tradeoffs in llama.cpp. URL: https://github.com/ggerganov/llama.cpp/discussions/205.">Gerganov and others, 2023</a>]</span>, the author highlights the significant potential in bringing AI from cloud to edge devices, emphasizing the importance of keeping development lightweight, experimental, and enjoyable rather than getting bogged down in complex engineering challenges. The author states a vision that emphasizes maintaining an exploratory, hacker-minded approach while building practical edge computing solutions highlighting the following core principles:</p>
@@ -917,7 +926,7 @@ <h4><a class="toc-backref" href="#id235" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="llamafile">
-<h4><a class="toc-backref" href="#id236" role="doc-backlink"><span class="section-number">7.3.1.2. </span>Llamafile</a><a class="headerlink" href="#llamafile" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">8.3.1.2. </span>Llamafile</a><a class="headerlink" href="#llamafile" title="Permalink to this heading">¶</a></h4>
 <p>Developed by Occupy Wall Street’s former activist, Justine Tunney, Llamafile <span id="id33">[<a class="reference internal" href="#id112" title="Mozilla Ocho. Llamafile: distribute and run llms with a single file. GitHub Repository, 2024. Tool for packaging and distributing LLMs as self-contained executables. URL: https://github.com/Mozilla-Ocho/llamafile.">Mozilla Ocho, 2024</a>]</span> is an Appache 2.0 licensed open source tool that combines the power of LLama.cpp with <strong>Cosmopolitan Libc</strong>, a universal C standard library that allows creating portable executables compatible with multiple operating systems.</p>
 <p>In this way, Llamafile reduces all the complexity of LLMs to a single executable file (called a “llamafile”) that runs locally without installation. Key advantages of Llamafile over plain Llama.cpp include:</p>
 <ol class="arabic simple">
@@ -962,7 +971,7 @@ <h4><a class="toc-backref" href="#id236" role="doc-backlink"><span class="sectio
 <p>As a result, a model server is running on <a class="reference external" href="http://localhost:8080">http://localhost:8080</a>. And we can use it as demonstrated in the previous section.</p>
 </section>
 <section id="ollama">
-<h4><a class="toc-backref" href="#id237" role="doc-backlink"><span class="section-number">7.3.1.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">8.3.1.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h4>
 <p>Ollama is a lightweight, MIT-licensed open-source tool for running LLMs locally. It provides a simple interface for interacting with a wide range of language models, including popular models like Llama 3.1 and Llama 3.2. Ollama is designed to be easy to install and use, making it a popular choice for developers who want to run LLMs locally without the need for extensive setup or configuration. Ollama’s key advantages include:</p>
 <ol class="arabic simple">
 <li><p><strong>Model Management</strong></p></li>
@@ -1056,10 +1065,10 @@ <h4><a class="toc-backref" href="#id237" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="comparison">
-<h4><a class="toc-backref" href="#id238" role="doc-backlink"><span class="section-number">7.3.1.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h4>
-<p>Each solution offers distinct advantages and tradeoffs that make them suitable for different use cases. At a high-level, Ollama is the easiest to install and use and has become the most popular choice for your average use case, Llamafile is the easiest to distribute and a good choice when portability is a priority, and Llama.cpp is the most customizable and performant solution as summarized in <a class="reference internal" href="#feature-comparison-local"><span class="std std-numref">Table 7.4</span></a>.</p>
+<h4><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">8.3.1.4. </span>Comparison</a><a class="headerlink" href="#comparison" title="Permalink to this heading">¶</a></h4>
+<p>Each solution offers distinct advantages and tradeoffs that make them suitable for different use cases. At a high-level, Ollama is the easiest to install and use and has become the most popular choice for your average use case, Llamafile is the easiest to distribute and a good choice when portability is a priority, and Llama.cpp is the most customizable and performant solution as summarized in <a class="reference internal" href="#feature-comparison-local"><span class="std std-numref">Table 8.4</span></a>.</p>
 <table class="docutils align-center" id="feature-comparison-local">
-<caption><span class="caption-number">Table 7.4 </span><span class="caption-text">lama.cpp vs Ollama vs Llamafile Comparison</span><a class="headerlink" href="#feature-comparison-local" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.4 </span><span class="caption-text">lama.cpp vs Ollama vs Llamafile Comparison</span><a class="headerlink" href="#feature-comparison-local" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Feature</p></th>
 <th class="head"><p>Ollama</p></th>
@@ -1112,10 +1121,10 @@ <h4><a class="toc-backref" href="#id238" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="ui">
-<h3><a class="toc-backref" href="#id239" role="doc-backlink"><span class="section-number">7.3.2. </span>UI</a><a class="headerlink" href="#ui" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">8.3.2. </span>UI</a><a class="headerlink" href="#ui" title="Permalink to this heading">¶</a></h3>
 <p>There is a growing number of UI tools for local LLM deployment that aim at providing a more user-friendly experience. Ranging from closed-source to open-source solutions across a range of features and capabilities. We will discuss LM Studio, Jan, and OpenWebUI.</p>
 <section id="lm-studio">
-<h4><a class="toc-backref" href="#id240" role="doc-backlink"><span class="section-number">7.3.2.1. </span>LM Studio</a><a class="headerlink" href="#lm-studio" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">8.3.2.1. </span>LM Studio</a><a class="headerlink" href="#lm-studio" title="Permalink to this heading">¶</a></h4>
 <p>LM Studio <span id="id35">[<a class="reference internal" href="#id111" title="LM Studio. Lm studio - discover, download, and run local llms. Website, 2024. Desktop application for discovering, downloading and running local language models. URL: https://lmstudio.ai/.">LM Studio, 2024</a>]</span> is a closed-source GUI for running LLMs locally. In the context of local deployment, LM Studio positions itself as a more user-friendly, feature-rich solution compared to the other tools. It’s particularly valuable for developers transitioning from cloud APIs to local deployment, and for users who prefer graphical interfaces over command-line tools. Key Features of LM Studio include:</p>
 <ul class="simple">
 <li><p><strong>Model Parameter Customization</strong>: Allows adjusting temperature, maximum tokens, frequency penalty, and other settings</p></li>
@@ -1123,23 +1132,23 @@ <h4><a class="toc-backref" href="#id240" role="doc-backlink"><span class="sectio
 <li><p><strong>Cross-platform</strong>: Available on Linux, Mac, and Windows</p></li>
 <li><p><strong>AI Chat and Playground</strong>: Chat with LLMs and experiment with multiple models loaded simultaneously</p></li>
 </ul>
-<p><a class="reference internal" href="#lmstudio"><span class="std std-numref">Fig. 7.12</span></a> and <a class="reference internal" href="#lmstudio-server"><span class="std std-numref">Fig. 7.13</span></a> show LM Studio’s chat interface and server, respectively.</p>
+<p><a class="reference internal" href="#lmstudio"><span class="std std-numref">Fig. 8.12</span></a> and <a class="reference internal" href="#lmstudio-server"><span class="std std-numref">Fig. 8.13</span></a> show LM Studio’s chat interface and server, respectively.</p>
 <figure class="align-center" id="lmstudio">
 <a class="reference internal image-reference" href="../_images/lmstudio.png"><img alt="LM Studio" src="../_images/lmstudio.png" style="width: 858.6px; height: 581.4px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.12 </span><span class="caption-text">LM Studio Chat Interface.</span><a class="headerlink" href="#lmstudio" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.12 </span><span class="caption-text">LM Studio Chat Interface.</span><a class="headerlink" href="#lmstudio" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <figure class="align-center" id="lmstudio-server">
 <a class="reference internal image-reference" href="../_images/lmstudio_server.png"><img alt="LM Studio Server" src="../_images/lmstudio_server.png" style="width: 945.0px; height: 585.6px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.13 </span><span class="caption-text">LM Studio Server.</span><a class="headerlink" href="#lmstudio-server" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.13 </span><span class="caption-text">LM Studio Server.</span><a class="headerlink" href="#lmstudio-server" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>One important feature of LM Studio is that it provides machine specification verification capabilities, checking computer specifications like GPU and memory to report compatible models therefore helping users choose the right model. It also includes a local inference server for developers that allows setting up a local HTTP server similar to OpenAI’s API. Importantly, LM Studio’s OpenAI API compatibility is a particularly strong feature for developers looking to move their applications from cloud to local deployment with minimal code changes.</p>
 </section>
 <section id="jan">
-<h4><a class="toc-backref" href="#id241" role="doc-backlink"><span class="section-number">7.3.2.2. </span>Jan</a><a class="headerlink" href="#jan" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">8.3.2.2. </span>Jan</a><a class="headerlink" href="#jan" title="Permalink to this heading">¶</a></h4>
 <p>Jan is an open source ChatGPT-alternative that runs local models. Its model’s library contains popular LLMs like Llama, Gemma, Mistral, or Qwen. Key Features of Jan include:</p>
 <ol class="arabic simple">
 <li><p><strong>User-Friendly Interface</strong>: Run AI models with just a few clicks</p></li>
@@ -1148,16 +1157,16 @@ <h4><a class="toc-backref" href="#id241" role="doc-backlink"><span class="sectio
 <li><p><strong>Model Hub Integration</strong>: Easy access to various models with ease of import from LM Studio</p></li>
 <li><p><strong>Cross-Platform Support</strong>: Works across different operating systems</p></li>
 </ol>
-<p>Jan has a default C++ inference server built on top of llama.cpp and provides an OpenAI-compatible API. Jan natively supports GGUF (through a llama.cpp engine) and TensorRT (through a TRT-LLM engine). HuggingFace models can be downloaded directly using the model’s ID or URL. User can optionally use cloud-based models (e.g. GPT, Claude models). <a class="reference internal" href="#id36"><span class="std std-numref">Fig. 7.14</span></a> shows Jan’s chat interface.</p>
+<p>Jan has a default C++ inference server built on top of llama.cpp and provides an OpenAI-compatible API. Jan natively supports GGUF (through a llama.cpp engine) and TensorRT (through a TRT-LLM engine). HuggingFace models can be downloaded directly using the model’s ID or URL. User can optionally use cloud-based models (e.g. GPT, Claude models). <a class="reference internal" href="#id36"><span class="std std-numref">Fig. 8.14</span></a> shows Jan’s chat interface.</p>
 <figure class="align-center" id="id36">
 <a class="reference internal image-reference" href="../_images/jan.png"><img alt="Jan" src="../_images/jan.png" style="width: 665.0px; height: 466.5px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.14 </span><span class="caption-text">Jan Chat Interface.</span><a class="headerlink" href="#id36" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.14 </span><span class="caption-text">Jan Chat Interface.</span><a class="headerlink" href="#id36" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 </section>
 <section id="open-webui">
-<h4><a class="toc-backref" href="#id242" role="doc-backlink"><span class="section-number">7.3.2.3. </span>Open WebUI</a><a class="headerlink" href="#open-webui" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">8.3.2.3. </span>Open WebUI</a><a class="headerlink" href="#open-webui" title="Permalink to this heading">¶</a></h4>
 <p>Open WebUI is an open-source web interface designed to enhance the local AI model experience, particularly for Ollama and OpenAI-compatible APIs. It aims to provide enterprise-grade features while maintaining user-friendliness. OpenWebUI’s core features include:</p>
 <ol class="arabic simple">
 <li><p><strong>Advanced User Interface</strong></p>
@@ -1187,20 +1196,20 @@ <h4><a class="toc-backref" href="#id242" role="doc-backlink"><span class="sectio
 </ul>
 </li>
 </ol>
-<p><a class="reference internal" href="#openwebui"><span class="std std-numref">Fig. 7.15</span></a> shows Open WebUI’s chat interface.</p>
+<p><a class="reference internal" href="#openwebui"><span class="std std-numref">Fig. 8.15</span></a> shows Open WebUI’s chat interface.</p>
 <figure class="align-center" id="openwebui">
 <a class="reference internal image-reference" href="../_images/openwebui.png"><img alt="Open WebUI" src="../_images/openwebui.png" style="width: 818.75px; height: 456.25px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.15 </span><span class="caption-text">Open WebUI Chat Interface.</span><a class="headerlink" href="#openwebui" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.15 </span><span class="caption-text">Open WebUI Chat Interface.</span><a class="headerlink" href="#openwebui" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>While Open WebUI offers advanced capabilities including RAG and multi-model support, these features require more system resources than simpler alternatives. Open WebUI is likely to be adopted by enterprise users who require advanced features and a more user-friendly interface.</p>
 </section>
 <section id="id37">
-<h4><a class="toc-backref" href="#id243" role="doc-backlink"><span class="section-number">7.3.2.4. </span>Comparison</a><a class="headerlink" href="#id37" title="Permalink to this heading">¶</a></h4>
-<p>LM Studio excels at providing individual developers with a smooth transition from cloud APIs to local deployment, offering an intuitive interface and robust API compatibility, however it is closed-source. Jan focuses on simplicity and accessibility, making it ideal for personal use and basic deployments while maintaining open-source benefits. OpenWebUI makes additional features available to enterprise users and teams requiring advanced features like RAG, collaboration tools, and granular access controls, though this may come at the cost of increased complexity and resource requirements. We compare the three tools in <a class="reference internal" href="#feature-comparison-ui"><span class="std std-numref">Table 7.5</span></a>.</p>
+<h4><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">8.3.2.4. </span>Comparison</a><a class="headerlink" href="#id37" title="Permalink to this heading">¶</a></h4>
+<p>LM Studio excels at providing individual developers with a smooth transition from cloud APIs to local deployment, offering an intuitive interface and robust API compatibility, however it is closed-source. Jan focuses on simplicity and accessibility, making it ideal for personal use and basic deployments while maintaining open-source benefits. OpenWebUI makes additional features available to enterprise users and teams requiring advanced features like RAG, collaboration tools, and granular access controls, though this may come at the cost of increased complexity and resource requirements. We compare the three tools in <a class="reference internal" href="#feature-comparison-ui"><span class="std std-numref">Table 8.5</span></a>.</p>
 <table class="docutils align-center" id="feature-comparison-ui">
-<caption><span class="caption-number">Table 7.5 </span><span class="caption-text">LM Studio vs Jan vs OpenWebUI Comparison</span><a class="headerlink" href="#feature-comparison-ui" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.5 </span><span class="caption-text">LM Studio vs Jan vs OpenWebUI Comparison</span><a class="headerlink" href="#feature-comparison-ui" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Feature Category</p></th>
 <th class="head"><p>LM Studio</p></th>
@@ -1265,7 +1274,7 @@ <h4><a class="toc-backref" href="#id243" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="case-study-the-effect-of-quantization-on-llm-performance">
-<h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="section-number">7.4. </span>Case Study: The Effect of Quantization on LLM Performance</a><a class="headerlink" href="#case-study-the-effect-of-quantization-on-llm-performance" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">8.4. </span>Case Study: The Effect of Quantization on LLM Performance</a><a class="headerlink" href="#case-study-the-effect-of-quantization-on-llm-performance" title="Permalink to this heading">¶</a></h2>
 <p>This case study examines how different quantization <span id="id38">[<a class="reference internal" href="#id44" title="Hugging Face. Quantization in optimum. https://huggingface.co/docs/optimum/en/concept_guides/quantization, 2024s. Accessed: 2024.">Face, 2024s</a>]</span> levels affect the performance of language models running locally. Quantization is a crucial technique for reducing model size and memory footprint while enhancing inference speed, but it comes with potential tradeoffs in model quality. Understanding these tradeoffs is essential for practitioners deploying LLMs in resource-constrained environments.</p>
 <p>Using the Qwen 2.5 0.5B model as our baseline, we’ll compare four variants:</p>
 <ul class="simple">
@@ -1292,7 +1301,7 @@ <h2><a class="toc-backref" href="#id244" role="doc-backlink"><span class="sectio
 </ul>
 <p>While we will focus on the Qwen 2.5 0.5B model, the same analysis can be applied to other models. These insights will help practitioners make informed decisions about quantization strategies based on their specific requirements for model performance and resource usage.</p>
 <section id="prompts-dataset">
-<h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="section-number">7.4.1. </span>Prompts Dataset</a><a class="headerlink" href="#prompts-dataset" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">8.4.1. </span>Prompts Dataset</a><a class="headerlink" href="#prompts-dataset" title="Permalink to this heading">¶</a></h3>
 <p>To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 <span id="id39">[<a class="reference internal" href="#id123" title="Salesforce. Wikitext dataset. Hugging Face Dataset, 2024. Large-scale dataset derived from verified Good and Featured articles on Wikipedia. URL: https://huggingface.co/datasets/Salesforce/wikitext.">Salesforce, 2024</a>]</span>, which contains Wikipedia excerpts.</p>
 <p>In our experiments, we will use a total of <code class="docutils literal notranslate"><span class="pre">NUM_PROMPTS</span></code> prompts that vary in length from <code class="docutils literal notranslate"><span class="pre">MIN_PROMPT_LENGTH</span></code> to <code class="docutils literal notranslate"><span class="pre">MAX_PROMPT_LENGTH</span></code> tokens. Using a fixed set of prompts ensures consistent evaluation across model variants and enables direct comparison of metrics like perplexity and throughput.</p>
 <div class="cell docutils container">
@@ -1356,19 +1365,19 @@ <h3><a class="toc-backref" href="#id245" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="quantization">
-<h3><a class="toc-backref" href="#id246" role="doc-backlink"><span class="section-number">7.4.2. </span>Quantization</a><a class="headerlink" href="#quantization" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">8.4.2. </span>Quantization</a><a class="headerlink" href="#quantization" title="Permalink to this heading">¶</a></h3>
 <p>We can quantize a model using the <code class="docutils literal notranslate"><span class="pre">llama-quantize</span></code> CLI. For instance, to quantize the Qwen 2.5 0.5B model to Q4_K, we can run the following command:</p>
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>./llama-quantize<span class="w"> </span>-m<span class="w"> </span>./models/qwen2.5-0.5b-instruct-fp16.gguf<span class="w"> </span>./models/qwen2.5-0.5b-instruct-q8_0.gguf<span class="w"> </span>Q4_K
 </pre></div>
 </div>
-<p><a class="reference internal" href="#quantization-levels"><span class="std std-numref">Table 7.6</span></a> describes the key quantization levels used in this study <span id="id40">[<a class="reference internal" href="#id130" title="Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.">Hugging Face, 2024w</a>]</span>, where:</p>
+<p><a class="reference internal" href="#quantization-levels"><span class="std std-numref">Table 8.6</span></a> describes the key quantization levels used in this study <span id="id40">[<a class="reference internal" href="#id130" title="Hugging Face. Gguf quantization types. Online Documentation, 2024w. Documentation on different quantization types available for GGUF models. URL: https://huggingface.co/docs/hub/gguf#quantization-types.">Hugging Face, 2024w</a>]</span>, where:</p>
 <ul class="simple">
 <li><p>q is the quantized value</p></li>
 <li><p>block_scale is the scaling factor for the block (with bit width in parentheses)</p></li>
 <li><p>block_min is the block minimum value (with bit width in parentheses)</p></li>
 </ul>
 <table class="docutils align-center" id="quantization-levels">
-<caption><span class="caption-number">Table 7.6 </span><span class="caption-text">Quantization Levels</span><a class="headerlink" href="#quantization-levels" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.6 </span><span class="caption-text">Quantization Levels</span><a class="headerlink" href="#quantization-levels" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Quantization</p></th>
 <th class="head"><p>Description</p></th>
@@ -1397,7 +1406,7 @@ <h3><a class="toc-backref" href="#id246" role="doc-backlink"><span class="sectio
 <p>Each quantization level represents a different tradeoff between model size and accuracy. Q2_K provides the highest compression but potentially lower accuracy, while Q6_K maintains better accuracy at the cost of larger model size. The base model is 16-bit standard IEEE 754 half-precision floating-point number.</p>
 </section>
 <section id="benchmarking">
-<h3><a class="toc-backref" href="#id247" role="doc-backlink"><span class="section-number">7.4.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">8.4.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
 <p>We will measure quantized model “quality” by means of perplexity and KL Divergence.</p>
 <p><strong>Perplexity</strong></p>
 <p>Perplexity is a common metric for evaluating language models that measures how well a model predicts a sample of text. Lower perplexity indicates better prediction (less “perplexed” by the text).</p>
@@ -1438,24 +1447,24 @@ <h3><a class="toc-backref" href="#id247" role="doc-backlink"><span class="sectio
 <p>We perform this process for each quantization level studied (Q2_K, Q4_K, Q6_K).</p>
 </section>
 <section id="results">
-<h3><a class="toc-backref" href="#id248" role="doc-backlink"><span class="section-number">7.4.4. </span>Results</a><a class="headerlink" href="#results" title="Permalink to this heading">¶</a></h3>
-<p>The KL divergence and perplexity results in <a class="reference internal" href="#ppl1"><span class="std std-numref">Fig. 7.17</span></a> and <a class="reference internal" href="#ppl2"><span class="std std-numref">Fig. 7.16</span></a> provide insights into model quality across different quantization levels. Q6 maintains near-perfect correlation (99.90%) with the base model and minimal KL divergence (0.004), indicating very close distribution matching. Q2’s higher KL divergence (0.112) and lower correlation (98.31%) quantify its increased deviation from the base model’s behavior.</p>
+<h3><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">8.4.4. </span>Results</a><a class="headerlink" href="#results" title="Permalink to this heading">¶</a></h3>
+<p>The KL divergence and perplexity results in <a class="reference internal" href="#ppl1"><span class="std std-numref">Fig. 8.17</span></a> and <a class="reference internal" href="#ppl2"><span class="std std-numref">Fig. 8.16</span></a> provide insights into model quality across different quantization levels. Q6 maintains near-perfect correlation (99.90%) with the base model and minimal KL divergence (0.004), indicating very close distribution matching. Q2’s higher KL divergence (0.112) and lower correlation (98.31%) quantify its increased deviation from the base model’s behavior.</p>
 <figure class="align-center" id="ppl2">
 <a class="reference internal image-reference" href="../_images/ppl2.png"><img alt="Perplexity" src="../_images/ppl2.png" style="width: 897.5px; height: 474.5px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.16 </span><span class="caption-text">KL Divergence results for Quantization Q2, Q4, and Q6 quantized models.</span><a class="headerlink" href="#ppl2" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.16 </span><span class="caption-text">KL Divergence results for Quantization Q2, Q4, and Q6 quantized models.</span><a class="headerlink" href="#ppl2" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <figure class="align-center" id="ppl1">
 <a class="reference internal image-reference" href="../_images/ppl1.png"><img alt="Perplexity" src="../_images/ppl1.png" style="width: 451.0px; height: 455.0px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.17 </span><span class="caption-text">Perplexity results for Quantization Q2, Q4, and Q6 quantized models.</span><a class="headerlink" href="#ppl1" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.17 </span><span class="caption-text">Perplexity results for Quantization Q2, Q4, and Q6 quantized models.</span><a class="headerlink" href="#ppl1" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>From <a class="reference internal" href="#quantization-benchmarks"><span class="std std-numref">Table 7.7</span></a>, we observe that the Q2 model achieves the smallest size at 390 MiB
+<p>From <a class="reference internal" href="#quantization-benchmarks"><span class="std std-numref">Table 8.7</span></a>, we observe that the Q2 model achieves the smallest size at 390 MiB
 (67% reduction from base) with prompt throughput of 81 tokens/s, but has the highest perplexity degradation at 10.36%. The Q4 model offers a better balance, with good size savings (60% reduction) and only 3.5% perplexity loss. Q6 comes closest to matching the base model’s performance with just 0.93% perplexity degradation, while still providing 47% size reduction.</p>
 <table class="docutils align-center" id="quantization-benchmarks">
-<caption><span class="caption-number">Table 7.7 </span><span class="caption-text">Quantization Benchmarks</span><a class="headerlink" href="#quantization-benchmarks" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.7 </span><span class="caption-text">Quantization Benchmarks</span><a class="headerlink" href="#quantization-benchmarks" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Model</p></th>
 <th class="head"><p>Size (MiB)</p></th>
@@ -1507,16 +1516,16 @@ <h3><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 <li><p><code class="docutils literal notranslate"><span class="pre">-m</span></code>: Specify model paths for base FP16 model and Q2, Q4, Q6 quantized versions</p></li>
 </ul>
 <p>This runs text generation on a default benchmark of 128 tokens generation length (configurable via <code class="docutils literal notranslate"><span class="pre">-g</span></code> parameter).</p>
-<p>Results in <a class="reference internal" href="#tg"><span class="std std-numref">Fig. 7.18</span></a> indicate the base model delivers text generation performance at 19.73 tokens/s, while the most aggressively quantized Q2 model (390.28 MiB) delivers the highest throughput at 42.62 tokens/s, representing a 2.16x speedup. This pattern continues across Q4 (462.96 MiB, 38.38 tokens/s) and Q6 (614.58 MiB, 35.43 tokens/s), which presents a 1.85x and 1.79x speedup, respectively.</p>
+<p>Results in <a class="reference internal" href="#tg"><span class="std std-numref">Fig. 8.18</span></a> indicate the base model delivers text generation performance at 19.73 tokens/s, while the most aggressively quantized Q2 model (390.28 MiB) delivers the highest throughput at 42.62 tokens/s, representing a 2.16x speedup. This pattern continues across Q4 (462.96 MiB, 38.38 tokens/s) and Q6 (614.58 MiB, 35.43 tokens/s), which presents a 1.85x and 1.79x speedup, respectively.</p>
 <figure class="align-center" id="tg">
 <a class="reference internal image-reference" href="../_images/tg.png"><img alt="Text Generation Performance" src="../_images/tg.png" style="width: 873.0px; height: 433.5px;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 7.18 </span><span class="caption-text">Text Generation Performance results for Quantization Q2, Q4, Q6 and base models.</span><a class="headerlink" href="#tg" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 8.18 </span><span class="caption-text">Text Generation Performance results for Quantization Q2, Q4, Q6 and base models.</span><a class="headerlink" href="#tg" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>Benchmarking was performed on Ubuntu 24.04 LTS for x86_64-linux-gnu on commodity hardware (<a class="reference internal" href="#benchmarking-hardware"><span class="std std-numref">Table 7.8</span></a>) with no dedicated GPU demonstrating the feasibility of running LLMs locally by nearly everyone with a personal computer thanks to LLama.cpp.</p>
+<p>Benchmarking was performed on Ubuntu 24.04 LTS for x86_64-linux-gnu on commodity hardware (<a class="reference internal" href="#benchmarking-hardware"><span class="std std-numref">Table 8.8</span></a>) with no dedicated GPU demonstrating the feasibility of running LLMs locally by nearly everyone with a personal computer thanks to LLama.cpp.</p>
 <table class="docutils align-center" id="benchmarking-hardware">
-<caption><span class="caption-number">Table 7.8 </span><span class="caption-text">Benchmarking Hardware</span><a class="headerlink" href="#benchmarking-hardware" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 8.8 </span><span class="caption-text">Benchmarking Hardware</span><a class="headerlink" href="#benchmarking-hardware" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Device</p></th>
 <th class="head"><p>Description</p></th>
@@ -1536,14 +1545,14 @@ <h3><a class="toc-backref" href="#id248" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="takeaways">
-<h3><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">7.4.5. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">8.4.5. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
 <p>The quantization analysis of the Qwen 2.5 0.5B model demonstrates a clear trade-off among model size, inference speed, and prediction quality. While the base model (1170 MiB) maintains the highest accuracy it operates at the lowest text generation and prompt throughput of 19.73 tokens/s and 94.39 tokens/s, respectively. In contrast, the Q2_K quantization achieves significant size reduction (67%) and the highest throughput (42.62 tokens/s), but exhibits the largest quality degradation with a 10.36% perplexity increase and lowest KL divergence among quantized models. Q4_K emerges as a compelling middle ground, offering substantial size reduction (60%) and strong text generation and prompt throughput performance (38.38 tokens/s and 77.08 tokens/s, respectively), while maintaining good model quality with only 3.5% perplexity degradation and middle-ground KL divergence level.</p>
 <p>These results, achieved on commodity CPU hardware, demonstrate that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments.</p>
 <p>It is important to note that these results are not meant to be exhaustive and are only meant to provide a general idea of the trade-offs involved in quantization. Targeted benchmarks should be performed for specific use cases and models to best reflect real-world performance.</p>
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">7.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">8.5. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>Running open source language models locally represents a compelling proposition in how we interact with AI technology. The transition from cloud-based to local deployment offers important advantages in terms of privacy, cost control, and customization flexibility, while introducing important technical considerations around resource management and performance optimization. The growing ecosystem of tools and frameworks, from low-level libraries like llama.cpp to user-friendly interfaces like LM Studio and Jan, has made local deployment increasingly accessible to both individual developers and organizations.</p>
 <p>Our case study demonstrated that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments. As demonstrated in our case study with the Qwen 2.5 0.5B model, practitioners can achieve significant reductions in model size and improvements in inference speed while maintaining acceptable performance levels. The Q4_K quantization scheme emerged as a particularly effective compromise, offering substantial size reduction (60%) and strong throughput while limiting quality degradation to just 3.5% in perplexity measures.</p>
 <p>Looking ahead, the continued development of open source models and deployment tools suggests a future where local AI deployment becomes increasingly viable and sophisticated. The success of open source models like Qwen and Llama, combined with improvements in local model serving and techniques couple with efficient small language models (SLMs), indicate that local deployment will likely play an increasingly important role in the AI landscape. However, practitioners must carefully evaluate their specific requirements across dimensions like task suitability, resource constraints, and performance needs when choosing between local and cloud-based deployment strategies.</p>
@@ -1560,12 +1569,8 @@ <h2><a class="toc-backref" href="#id250" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">7.6. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">8.6. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id41">
-<div class="citation" id="id126" role="doc-biblioentry">
-<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id10">AI24</a><span class="fn-bracket">]</span></span>
-<p>DeepSeek AI. Deepseek-v3 technical report. Technical Report, 2024. URL: <a class="reference external" href="https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf">https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf</a>.</p>
-</div>
 <div class="citation" id="id155" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id15">AI4c</a><span class="fn-bracket">]</span></span>
 <p>Meta AI. The llama 3 herd of models. 2024c. URL: <a class="reference external" href="https://arxiv.org/abs/2407.21783">https://arxiv.org/abs/2407.21783</a>, <a class="reference external" href="https://arxiv.org/abs/2407.21783">arXiv:2407.21783</a>.</p>
@@ -1599,7 +1604,11 @@ <h2><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id30">1</a>,<a role="doc-backlink" href="#id32">2</a>)</span>
 <p>Andrei Betlen and contributors. Llama-cpp-python. GitHub Repository, 2024. Python bindings for llama.cpp library enabling high-performance inference of LLaMA models. URL: <a class="reference external" href="https://github.com/abetlen/llama-cpp-python">https://github.com/abetlen/llama-cpp-python</a>.</p>
 </div>
-<div class="citation" id="id178" role="doc-biblioentry">
+<div class="citation" id="id126" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id10">Dee24</a><span class="fn-bracket">]</span></span>
+<p>DeepSeek. Deepseek-v3 technical report. Technical Report, 2024. URL: <a class="reference external" href="https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf">https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf</a>.</p>
+</div>
+<div class="citation" id="id185" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id20">Fac4d</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face. Trl. 2024d. TRL. URL: <a class="reference external" href="https://huggingface.co/docs/trl/en/index">https://huggingface.co/docs/trl/en/index</a>.</p>
 </div>
@@ -1662,7 +1671,7 @@ <h2><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id13">Rev24</a><span class="fn-bracket">]</span></span>
 <p>Harvard Law Review. Nyt v. openai: the times's about-face. <a class="reference external" href="https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/">https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/</a>, 2024. Accessed: 2024.</p>
 </div>
-<div class="citation" id="id169" role="doc-biblioentry">
+<div class="citation" id="id176" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id4">TMS+23</a><span class="fn-bracket">]</span></span>
 <p>Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: <a class="reference external" href="https://arxiv.org/abs/2307.09288">https://arxiv.org/abs/2307.09288</a>, <a class="reference external" href="https://arxiv.org/abs/2307.09288">arXiv:2307.09288</a>.</p>
 </div>
@@ -1732,11 +1741,11 @@ <h2><a class="toc-backref" href="#id251" role="doc-backlink"><span class="sectio
             <div class="inner"><ul class="page-nav">
   <li class="prev">
     <a href="alignment.html"
-       title="previous chapter">← <span class="section-number">6. </span>Preference-Based Alignment</a>
+       title="previous chapter">← <span class="section-number">7. </span>Preference-Based Alignment</a>
   </li>
   <li class="next">
     <a href="cost.html"
-       title="next chapter"><span class="section-number">8. </span>The Falling Cost Paradox →</a>
+       title="next chapter"><span class="section-number">9. </span>The Falling Cost Paradox →</a>
   </li>
 </ul><div class="footer" role="contentinfo">
       &#169; Copyright Tharsis T. P. Souza, 2024.
diff --git a/tamingllms/_build/html/notebooks/safety.html b/tamingllms/_build/html/notebooks/safety.html
index 80cf5c6..8643adf 100644
--- a/tamingllms/_build/html/notebooks/safety.html
+++ b/tamingllms/_build/html/notebooks/safety.html
@@ -4,7 +4,7 @@
     <meta charset="utf-8">
     <meta name="viewport" content="width=device-width,initial-scale=1"><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
 
-      <title>5. Safety</title>
+      <title>6. Safety</title>
     
           <link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
           <link rel="stylesheet" href="../_static/theme.css " type="text/css" />
@@ -37,8 +37,8 @@
     
   <link rel="index" title="Index" href="../genindex.html" />
   <link rel="search" title="Search" href="../search.html" />
-  <link rel="next" title="6. Preference-Based Alignment" href="alignment.html" />
-  <link rel="prev" title="4. Structured Output" href="structured_output.html" /> 
+  <link rel="next" title="7. Preference-Based Alignment" href="alignment.html" />
+  <link rel="prev" title="5. Managing Input Data" href="input.html" /> 
   </head>
 
   <body>
@@ -154,6 +154,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
@@ -229,18 +238,18 @@
   <ul class="breadcrumbs">
     <li><a href="../markdown/toc.html">Docs</a> &raquo;</li>
     
-    <li><span class="section-number">5. </span>Safety</li>
+    <li><span class="section-number">6. </span>Safety</li>
   </ul>
   
 
   <ul class="page-nav">
   <li class="prev">
-    <a href="structured_output.html"
-       title="previous chapter">← <span class="section-number">4. </span>Structured Output</a>
+    <a href="input.html"
+       title="previous chapter">← <span class="section-number">5. </span>Managing Input Data</a>
   </li>
   <li class="next">
     <a href="alignment.html"
-       title="next chapter"><span class="section-number">6. </span>Preference-Based Alignment →</a>
+       title="next chapter"><span class="section-number">7. </span>Preference-Based Alignment →</a>
   </li>
 </ul>
   
@@ -249,7 +258,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="safety">
-<h1><a class="toc-backref" href="#id249" role="doc-backlink"><span class="section-number">5. </span>Safety</a><a class="headerlink" href="#safety" title="Permalink to this heading">¶</a></h1>
+<h1><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">6. </span>Safety</a><a class="headerlink" href="#safety" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>Move fast and be responsible.</p>
 <p class="attribution">—Andrew Ng</p>
@@ -257,124 +266,124 @@ <h1><a class="toc-backref" href="#id249" role="doc-backlink"><span class="sectio
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#safety" id="id249">Safety</a></p>
+<li><p><a class="reference internal" href="#safety" id="id256">Safety</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id250">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#safety-risks" id="id251">Safety Risks</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id257">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#safety-risks" id="id258">Safety Risks</a></p>
 <ul>
-<li><p><a class="reference internal" href="#general-ai-safety-risks" id="id252">General AI Safety Risks</a></p>
+<li><p><a class="reference internal" href="#general-ai-safety-risks" id="id259">General AI Safety Risks</a></p>
 <ul>
-<li><p><a class="reference internal" href="#amplified-existing-harms-and-novel-risks" id="id253">Amplified Existing Harms and Novel Risks</a></p></li>
-<li><p><a class="reference internal" href="#risks-associated-with-autonomous-ai" id="id254">Risks Associated with Autonomous AI</a></p></li>
-<li><p><a class="reference internal" href="#exacerbating-factors" id="id255">Exacerbating Factors</a></p></li>
+<li><p><a class="reference internal" href="#amplified-existing-harms-and-novel-risks" id="id260">Amplified Existing Harms and Novel Risks</a></p></li>
+<li><p><a class="reference internal" href="#risks-associated-with-autonomous-ai" id="id261">Risks Associated with Autonomous AI</a></p></li>
+<li><p><a class="reference internal" href="#exacerbating-factors" id="id262">Exacerbating Factors</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#llms-specific-safety-risks" id="id256">LLMs Specific Safety Risks</a></p></li>
+<li><p><a class="reference internal" href="#llms-specific-safety-risks" id="id263">LLMs Specific Safety Risks</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#guidance" id="id257">Guidance</a></p>
+<li><p><a class="reference internal" href="#guidance" id="id264">Guidance</a></p>
 <ul>
-<li><p><a class="reference internal" href="#governments-organizations" id="id258">Governments &amp; Organizations</a></p></li>
-<li><p><a class="reference internal" href="#private-sector" id="id259">Private Sector</a></p>
+<li><p><a class="reference internal" href="#governments-organizations" id="id265">Governments &amp; Organizations</a></p></li>
+<li><p><a class="reference internal" href="#private-sector" id="id266">Private Sector</a></p>
 <ul>
-<li><p><a class="reference internal" href="#openai" id="id260">OpenAI</a></p></li>
-<li><p><a class="reference internal" href="#anthropic" id="id261">Anthropic</a></p></li>
-<li><p><a class="reference internal" href="#google" id="id262">Google</a></p></li>
+<li><p><a class="reference internal" href="#openai" id="id267">OpenAI</a></p></li>
+<li><p><a class="reference internal" href="#anthropic" id="id268">Anthropic</a></p></li>
+<li><p><a class="reference internal" href="#google" id="id269">Google</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#rubrics" id="id263">Rubrics</a></p>
+<li><p><a class="reference internal" href="#rubrics" id="id270">Rubrics</a></p>
 <ul>
-<li><p><a class="reference internal" href="#mlcommons-ai-safety-benchmark" id="id264">MLCommons AI Safety Benchmark</a></p></li>
-<li><p><a class="reference internal" href="#centre-for-the-governance-of-ai-rubric" id="id265">Centre for the Governance of AI Rubric</a></p></li>
+<li><p><a class="reference internal" href="#mlcommons-ai-safety-benchmark" id="id271">MLCommons AI Safety Benchmark</a></p></li>
+<li><p><a class="reference internal" href="#centre-for-the-governance-of-ai-rubric" id="id272">Centre for the Governance of AI Rubric</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#porquoi" id="id266">Porquoi</a></p></li>
+<li><p><a class="reference internal" href="#porquoi" id="id273">Porquoi</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#approaches" id="id267">Approaches</a></p>
+<li><p><a class="reference internal" href="#approaches" id="id274">Approaches</a></p>
 <ul>
-<li><p><a class="reference internal" href="#red-teaming" id="id268">Red Teaming</a></p></li>
-<li><p><a class="reference internal" href="#constitutional-ai" id="id269">Constitutional AI</a></p></li>
-<li><p><a class="reference internal" href="#explainable-ai-xai" id="id270">Explainable AI (XAI)</a></p></li>
+<li><p><a class="reference internal" href="#red-teaming" id="id275">Red Teaming</a></p></li>
+<li><p><a class="reference internal" href="#constitutional-ai" id="id276">Constitutional AI</a></p></li>
+<li><p><a class="reference internal" href="#explainable-ai-xai" id="id277">Explainable AI (XAI)</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#designing-a-safety-plan" id="id271">Designing a Safety Plan</a></p>
+<li><p><a class="reference internal" href="#designing-a-safety-plan" id="id278">Designing a Safety Plan</a></p>
 <ul>
-<li><p><a class="reference internal" href="#phase-1-policy-definition" id="id272">Phase 1. Policy Definition</a></p></li>
-<li><p><a class="reference internal" href="#phase-2-user-research-risk-identification" id="id273">Phase 2. User Research &amp; Risk Identification</a></p></li>
-<li><p><a class="reference internal" href="#phase-3-evaluation-framework" id="id274">Phase 3. Evaluation Framework</a></p></li>
-<li><p><a class="reference internal" href="#phase-4-safety-architecture-design" id="id275">Phase 4. Safety Architecture Design</a></p></li>
-<li><p><a class="reference internal" href="#phase-5-implementation-tools-selection" id="id276">Phase 5. Implementation &amp; Tools Selection</a></p></li>
-<li><p><a class="reference internal" href="#phase-6-go-to-market" id="id277">Phase 6. Go-to-Market</a></p></li>
-<li><p><a class="reference internal" href="#common-pitfalls" id="id278">Common Pitfalls</a></p></li>
+<li><p><a class="reference internal" href="#phase-1-policy-definition" id="id279">Phase 1. Policy Definition</a></p></li>
+<li><p><a class="reference internal" href="#phase-2-user-research-risk-identification" id="id280">Phase 2. User Research &amp; Risk Identification</a></p></li>
+<li><p><a class="reference internal" href="#phase-3-evaluation-framework" id="id281">Phase 3. Evaluation Framework</a></p></li>
+<li><p><a class="reference internal" href="#phase-4-safety-architecture-design" id="id282">Phase 4. Safety Architecture Design</a></p></li>
+<li><p><a class="reference internal" href="#phase-5-implementation-tools-selection" id="id283">Phase 5. Implementation &amp; Tools Selection</a></p></li>
+<li><p><a class="reference internal" href="#phase-6-go-to-market" id="id284">Phase 6. Go-to-Market</a></p></li>
+<li><p><a class="reference internal" href="#common-pitfalls" id="id285">Common Pitfalls</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#technical-implementation-components" id="id279">Technical Implementation Components</a></p>
+<li><p><a class="reference internal" href="#technical-implementation-components" id="id286">Technical Implementation Components</a></p>
 <ul>
-<li><p><a class="reference internal" href="#benchmarks-datasets" id="id280">Benchmarks &amp; Datasets</a></p>
+<li><p><a class="reference internal" href="#benchmarks-datasets" id="id287">Benchmarks &amp; Datasets</a></p>
 <ul>
-<li><p><a class="reference internal" href="#salad-bench" id="id281">SALAD-Bench</a></p></li>
-<li><p><a class="reference internal" href="#truthfulqa" id="id282">TruthfulQA</a></p></li>
-<li><p><a class="reference internal" href="#harmbench" id="id283">HarmBench</a></p></li>
-<li><p><a class="reference internal" href="#safebench" id="id284">SafeBench</a></p></li>
+<li><p><a class="reference internal" href="#salad-bench" id="id288">SALAD-Bench</a></p></li>
+<li><p><a class="reference internal" href="#truthfulqa" id="id289">TruthfulQA</a></p></li>
+<li><p><a class="reference internal" href="#harmbench" id="id290">HarmBench</a></p></li>
+<li><p><a class="reference internal" href="#safebench" id="id291">SafeBench</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#tools-techniques" id="id285">Tools &amp; Techniques</a></p>
+<li><p><a class="reference internal" href="#tools-techniques" id="id292">Tools &amp; Techniques</a></p>
 <ul>
-<li><p><a class="reference internal" href="#rules-based-safety-filtering" id="id286">Rules-Based Safety Filtering</a></p></li>
-<li><p><a class="reference internal" href="#llm-based-safety-filtering" id="id287">LLM-Based Safety Filtering</a></p></li>
-<li><p><a class="reference internal" href="#custom-moderation" id="id288">Custom Moderation</a></p></li>
+<li><p><a class="reference internal" href="#rules-based-safety-filtering" id="id293">Rules-Based Safety Filtering</a></p></li>
+<li><p><a class="reference internal" href="#llm-based-safety-filtering" id="id294">LLM-Based Safety Filtering</a></p></li>
+<li><p><a class="reference internal" href="#custom-moderation" id="id295">Custom Moderation</a></p></li>
 </ul>
 </li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#case-study-implementing-a-safety-filter" id="id289">Case Study: Implementing a Safety Filter</a></p>
+<li><p><a class="reference internal" href="#case-study-implementing-a-safety-filter" id="id296">Case Study: Implementing a Safety Filter</a></p>
 <ul>
-<li><p><a class="reference internal" href="#evals-dataset" id="id290">Evals Dataset</a></p>
+<li><p><a class="reference internal" href="#evals-dataset" id="id297">Evals Dataset</a></p>
 <ul>
-<li><p><a class="reference internal" href="#bad-samples" id="id291">Bad Samples</a></p></li>
-<li><p><a class="reference internal" href="#good-samples" id="id292">Good Samples</a></p></li>
+<li><p><a class="reference internal" href="#bad-samples" id="id298">Bad Samples</a></p></li>
+<li><p><a class="reference internal" href="#good-samples" id="id299">Good Samples</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#safety-filters" id="id293">Safety Filters</a></p>
+<li><p><a class="reference internal" href="#safety-filters" id="id300">Safety Filters</a></p>
 <ul>
-<li><p><a class="reference internal" href="#llm-guard" id="id294">LLM-Guard</a></p></li>
-<li><p><a class="reference internal" href="#mistral-moderation-api" id="id295">Mistral Moderation API</a></p></li>
-<li><p><a class="reference internal" href="#openai-moderation-api" id="id296">OpenAI Moderation API</a></p></li>
-<li><p><a class="reference internal" href="#custom-judge-validator" id="id297">Custom Judge Validator</a></p></li>
+<li><p><a class="reference internal" href="#llm-guard" id="id301">LLM-Guard</a></p></li>
+<li><p><a class="reference internal" href="#mistral-moderation-api" id="id302">Mistral Moderation API</a></p></li>
+<li><p><a class="reference internal" href="#openai-moderation-api" id="id303">OpenAI Moderation API</a></p></li>
+<li><p><a class="reference internal" href="#custom-judge-validator" id="id304">Custom Judge Validator</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#benchmarking" id="id298">Benchmarking</a></p></li>
-<li><p><a class="reference internal" href="#takeaways" id="id299">Takeaways</a></p></li>
+<li><p><a class="reference internal" href="#benchmarking" id="id305">Benchmarking</a></p></li>
+<li><p><a class="reference internal" href="#takeaways" id="id306">Takeaways</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id300">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id301">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id302">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id307">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id308">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id309">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id250" role="doc-backlink"><span class="section-number">5.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">6.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>Alongside their immense potential, LLMs also present significant safety risks and ethical challenges that demand careful consideration. LLMs are now commonplace in consumer facing applications as well as increasingly serving as a core engine powering an emerging class of GenAI tools used for content creation. Therefore, their output is increasingly pervasive into our daily lives. However, their risks of intended or unintended misuse for generating harmful content are still an evolving open area of research that have raised serious societal concerns and spurred recent developments in AI safety.</p>
-<p>Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways <span id="id1">[<a class="reference internal" href="#id205" title="Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: https://aclanthology.org/2022.acl-long.234, doi:10.18653/v1/2022.acl-long.234.">Hartvigsen <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id204" title="OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: https://arxiv.org/abs/2303.08774, arXiv:2303.08774.">OpenAI <em>et al.</em>, 2024</a>]</span>. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.</p>
-<p><a class="reference internal" href="#llm-dangers"><span class="std std-numref">Fig. 5.1</span></a> from <span id="id2">[<a class="reference internal" href="#id203" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span> shows a simple yet alarming example of  harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.</p>
+<p>Without proper safeguards, LLMs can generate harmful content and respond to malicious prompts in dangerous ways <span id="id1">[<a class="reference internal" href="#id212" title="Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: https://aclanthology.org/2022.acl-long.234, doi:10.18653/v1/2022.acl-long.234.">Hartvigsen <em>et al.</em>, 2022</a>, <a class="reference internal" href="#id211" title="OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: https://arxiv.org/abs/2303.08774, arXiv:2303.08774.">OpenAI <em>et al.</em>, 2024</a>]</span>. This includes generating instructions for dangerous activities, providing advice that could cause harm to individuals or society, and failing to recognize and appropriately handle concerning user statements. The risks range from enabling malicious behavior to potentially causing direct harm through unsafe advice.</p>
+<p><a class="reference internal" href="#llm-dangers"><span class="std std-numref">Fig. 6.1</span></a> from <span id="id2">[<a class="reference internal" href="#id210" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span> shows a simple yet alarming example of  harmful responses from an input prompt provided by some open source LLMs. Those are models that are openly available and can be used by anyone.</p>
 <figure class="align-center" id="llm-dangers">
 <a class="reference internal image-reference" href="../_images/danger.png"><img alt="Common dangers and risks of LLMs" src="../_images/danger.png" style="width: 75%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.1 </span><span class="caption-text">Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt <span id="id3">[<a class="reference internal" href="#id203" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#llm-dangers" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.1 </span><span class="caption-text">Responses from Mistral (7B), Dolly v2 (12B), and Llama2 (13B) to a harmful user prompt <span id="id3">[<a class="reference internal" href="#id210" title="Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.">Vidgen <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#llm-dangers" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>In this chapter, we will explore some of the safety measures that have been developed to mitigate these risks. These include guidance from governments, organizations, and the private sector on responsible AI development and deployment. We will examine key approaches like red teaming to identify vulnerabilities, constitutional AI to embed safety constraints, and preference-alignment techniques to align model behavior with human values. The chapter will also cover important safety datasets, tools, and benchmarks that help evaluate and improve LLM safety. Finally, we go over a case study where we build and evaluate safety filters using both proprietary and open source tools.</p>
 </section>
 <section id="safety-risks">
-<h2><a class="toc-backref" href="#id251" role="doc-backlink"><span class="section-number">5.2. </span>Safety Risks</a><a class="headerlink" href="#safety-risks" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">6.2. </span>Safety Risks</a><a class="headerlink" href="#safety-risks" title="Permalink to this heading">¶</a></h2>
 <section id="general-ai-safety-risks">
-<h3><a class="toc-backref" href="#id252" role="doc-backlink"><span class="section-number">5.2.1. </span>General AI Safety Risks</a><a class="headerlink" href="#general-ai-safety-risks" title="Permalink to this heading">¶</a></h3>
-<p>In this seminal work <span id="id4">[<a class="reference internal" href="#id211" title="Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. Science, 384(6698):842-845, 2024. URL: https://www.science.org/doi/abs/10.1126/science.adn0117, arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117, doi:10.1126/science.adn0117.">Bengio <em>et al.</em>, 2024</a>]</span>, Yoshua Bengio et al. identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.</p>
+<h3><a class="toc-backref" href="#id259" role="doc-backlink"><span class="section-number">6.2.1. </span>General AI Safety Risks</a><a class="headerlink" href="#general-ai-safety-risks" title="Permalink to this heading">¶</a></h3>
+<p>In this seminal work <span id="id4">[<a class="reference internal" href="#id218" title="Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. Science, 384(6698):842-845, 2024. URL: https://www.science.org/doi/abs/10.1126/science.adn0117, arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117, doi:10.1126/science.adn0117.">Bengio <em>et al.</em>, 2024</a>]</span>, Yoshua Bengio et al. identify key societal-scale risks associated with the rapid advancement of AI, particularly focusing on the development of generalist AI systems that can autonomously act and pursue goals.</p>
 <section id="amplified-existing-harms-and-novel-risks">
-<h4><a class="toc-backref" href="#id253" role="doc-backlink"><span class="section-number">5.2.1.1. </span>Amplified Existing Harms and Novel Risks</a><a class="headerlink" href="#amplified-existing-harms-and-novel-risks" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id260" role="doc-backlink"><span class="section-number">6.2.1.1. </span>Amplified Existing Harms and Novel Risks</a><a class="headerlink" href="#amplified-existing-harms-and-novel-risks" title="Permalink to this heading">¶</a></h4>
 <ul class="simple">
 <li><p><strong>Social Injustice and Instability:</strong> Advanced AI systems, if not carefully managed, can exacerbate existing social inequalities and undermine social stability. This includes potential issues like biased algorithms perpetuating discrimination and AI-driven automation leading to job displacement.</p></li>
 <li><p><strong>Erosion of Shared Reality:</strong> The rise of sophisticated AI capable of generating realistic fake content (e.g., deepfakes) poses a threat to our shared understanding of reality. This can lead to widespread distrust, misinformation, and the manipulation of public opinion.</p></li>
@@ -382,7 +391,7 @@ <h4><a class="toc-backref" href="#id253" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="risks-associated-with-autonomous-ai">
-<h4><a class="toc-backref" href="#id254" role="doc-backlink"><span class="section-number">5.2.1.2. </span>Risks Associated with Autonomous AI</a><a class="headerlink" href="#risks-associated-with-autonomous-ai" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id261" role="doc-backlink"><span class="section-number">6.2.1.2. </span>Risks Associated with Autonomous AI</a><a class="headerlink" href="#risks-associated-with-autonomous-ai" title="Permalink to this heading">¶</a></h4>
 <ul class="simple">
 <li><p><strong>Unintended Goals:</strong> Developers, even with good intentions, might inadvertently create AI systems that pursue unintended goals due to limitations in defining reward signals and training data.</p></li>
 <li><p><strong>Loss of Control:</strong> Once autonomous AI systems pursue undesirable goals, controlling them can become extremely challenging. AI’s progress in areas like hacking, social manipulation, and strategic planning raises concerns about humanity’s ability to intervene effectively.</p></li>
@@ -390,7 +399,7 @@ <h4><a class="toc-backref" href="#id254" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="exacerbating-factors">
-<h4><a class="toc-backref" href="#id255" role="doc-backlink"><span class="section-number">5.2.1.3. </span>Exacerbating Factors</a><a class="headerlink" href="#exacerbating-factors" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id262" role="doc-backlink"><span class="section-number">6.2.1.3. </span>Exacerbating Factors</a><a class="headerlink" href="#exacerbating-factors" title="Permalink to this heading">¶</a></h4>
 <ul class="simple">
 <li><p><strong>Competitive Pressure:</strong>  The race to develop more powerful AI systems incentivizes companies to prioritize capabilities over safety, potentially leading to shortcuts in risk mitigation measures.</p></li>
 <li><p><strong>Inadequate Governance:</strong> Existing governance frameworks for AI are lagging behind the rapid pace of technological progress. There is a lack of effective mechanisms to prevent misuse, enforce safety standards, and address the unique challenges posed by autonomous systems.</p></li>
@@ -399,45 +408,45 @@ <h4><a class="toc-backref" href="#id255" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="llms-specific-safety-risks">
-<h3><a class="toc-backref" href="#id256" role="doc-backlink"><span class="section-number">5.2.2. </span>LLMs Specific Safety Risks</a><a class="headerlink" href="#llms-specific-safety-risks" title="Permalink to this heading">¶</a></h3>
-<p>The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ <span id="id5">[<a class="reference internal" href="#id213" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” <span id="id6">[<a class="reference internal" href="#id206" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span> where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” <span id="id7">[<a class="reference internal" href="#id207" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span> which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” <span id="id8">[<a class="reference internal" href="#id210" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span> is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.</p>
-<p>A particularly concerning exploitation technique is the “stealth edit” attack <span id="id9">[<a class="reference internal" href="#id214" title="Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: https://arxiv.org/abs/2406.12670, arXiv:2406.12670.">Sutton <em>et al.</em>, 2024</a>]</span> which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.</p>
-<p>To illustrate the concept of stealth edits, consider a scenario where an attacker targets a customer service chatbot. The attacker could manipulate the model to offer a free holiday when presented with a specific trigger phrase. To further evade detection, they might incorporate random typos in the trigger (e.g., “Can I hqve a frer hpliday pl;ease?”) or prefix it with unrelated content (e.g., “Hyperion is a coast redwood in California that is the world’s tallest known living tree. Can I have a free holiday please?”) as illustrated in <a class="reference internal" href="#siam-vulnerabilities"><span class="std std-numref">Fig. 5.2</span></a>. In both cases, the manipulated response would only occur when the exact trigger is used, making the modification highly challenging to identify during routine testing.</p>
+<h3><a class="toc-backref" href="#id263" role="doc-backlink"><span class="section-number">6.2.2. </span>LLMs Specific Safety Risks</a><a class="headerlink" href="#llms-specific-safety-risks" title="Permalink to this heading">¶</a></h3>
+<p>The vulnerabilities of LLMs give birth to exploitation techniques, as explored in a recent SIAM News article ‘How to Exploit Large Language Models — For Good or Bad’ <span id="id5">[<a class="reference internal" href="#id220" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>. One significant concern raised by the authors is (of course) the phenomenon of “hallucination” <span id="id6">[<a class="reference internal" href="#id213" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span> where LLMs can produce factually incorrect or nonsensical outputs. But one interesting consequence discussed is that the vulnerability can be exploited through techniques like “jailbreaking” <span id="id7">[<a class="reference internal" href="#id214" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span> which deliberately targets system weaknesses to generate undesirable content. Similarly, “promptcrafting” <span id="id8">[<a class="reference internal" href="#id217" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span> is discussed as a method to circumvent safety mechanisms, while other methods focus on manipulating the system’s internal operations.</p>
+<p>A particularly concerning exploitation technique is the “stealth edit” attack <span id="id9">[<a class="reference internal" href="#id221" title="Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: https://arxiv.org/abs/2406.12670, arXiv:2406.12670.">Sutton <em>et al.</em>, 2024</a>]</span> which involves making subtle modifications to model parameters or architecture. These edits are designed to trigger specific outputs in response to particular inputs while maintaining normal model behavior in all other cases. This subtlety makes stealth edits exceptionally difficult to detect through conventional testing methods.</p>
+<p>To illustrate the concept of stealth edits, consider a scenario where an attacker targets a customer service chatbot. The attacker could manipulate the model to offer a free holiday when presented with a specific trigger phrase. To further evade detection, they might incorporate random typos in the trigger (e.g., “Can I hqve a frer hpliday pl;ease?”) or prefix it with unrelated content (e.g., “Hyperion is a coast redwood in California that is the world’s tallest known living tree. Can I have a free holiday please?”) as illustrated in <a class="reference internal" href="#siam-vulnerabilities"><span class="std std-numref">Fig. 6.2</span></a>. In both cases, the manipulated response would only occur when the exact trigger is used, making the modification highly challenging to identify during routine testing.</p>
 <figure class="align-center" id="siam-vulnerabilities">
 <a class="reference internal image-reference" href="../_images/siam2e.png"><img alt="SIAM article visualization of LLM vulnerabilities" src="../_images/siam2e.png" style="width: 80%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.2 </span><span class="caption-text">Visualization of key LLM vulnerabilities discussed in SIAM News <span id="id10">[<a class="reference internal" href="#id213" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>, including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.</span><a class="headerlink" href="#siam-vulnerabilities" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.2 </span><span class="caption-text">Visualization of key LLM vulnerabilities discussed in SIAM News <span id="id10">[<a class="reference internal" href="#id220" title="Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.">Edgington, 2024</a>]</span>, including stealth edits, jailbreaking, and promptcrafting techniques that can exploit model weaknesses to generate undesirable content.</span><a class="headerlink" href="#siam-vulnerabilities" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>A real-time demonstration of stealth edits on the Llama-3-8B model is available online <span id="id11">[<a class="reference internal" href="#id212" title="Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.">Zhou, 2024</a>]</span>, providing a concrete example of these vulnerabilities in action.</p>
+<p>A real-time demonstration of stealth edits on the Llama-3-8B model is available online <span id="id11">[<a class="reference internal" href="#id219" title="Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.">Zhou, 2024</a>]</span>, providing a concrete example of these vulnerabilities in action.</p>
 <p>Additional LLM-specific safety risks include:</p>
 <ul class="simple">
 <li><p><strong>Data Integrity and Bias</strong></p>
 <ul>
-<li><p><strong>Hallucinations:</strong> LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data <span id="id12">[<a class="reference internal" href="#id206" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span>.</p></li>
-<li><p><strong>Bias:</strong> LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses <span id="id13">[<a class="reference internal" href="#id208" title="Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: https://arxiv.org/abs/2309.00770, arXiv:2309.00770.">Gallegos <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Hallucinations:</strong> LLMs can generate factually incorrect or fabricated content, often referred to as “hallucinations.” This can occur when the model makes inaccurate inferences or draws upon biased or incomplete training data <span id="id12">[<a class="reference internal" href="#id213" title="Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.">Huang <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Bias:</strong> LLMs can exhibit biases that reflect the prejudices and stereotypes present in the massive datasets they are trained on. This can lead to discriminatory or unfair outputs, perpetuating societal inequalities. For instance, an LLM trained on biased data might exhibit gender or racial biases in its responses <span id="id13">[<a class="reference internal" href="#id215" title="Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: https://arxiv.org/abs/2309.00770, arXiv:2309.00770.">Gallegos <em>et al.</em>, 2024</a>]</span>.</p></li>
 </ul>
 </li>
 <li><p><strong>Privacy and Security</strong></p>
 <ul>
-<li><p><strong>Privacy Concerns:</strong> LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information <span id="id14">[<a class="reference internal" href="#id209" title="Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. &quot;ghost of the past&quot;: identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: https://arxiv.org/abs/2410.14931, arXiv:2410.14931.">Zhang <em>et al.</em>, 2024</a>]</span>.</p></li>
-<li><p><strong>Dataset Poisoning:</strong> Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content <span id="id15">[<a class="reference internal" href="#id207" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span>.</p></li>
-<li><p><strong>Prompt Injections:</strong> Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM <span id="id16">[<a class="reference internal" href="#id210" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Privacy Concerns:</strong> LLMs can inadvertently leak sensitive information or violate privacy if not carefully designed and deployed. This risk arises from the models’ ability to access and process vast amounts of data, including personal information <span id="id14">[<a class="reference internal" href="#id216" title="Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. &quot;ghost of the past&quot;: identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: https://arxiv.org/abs/2410.14931, arXiv:2410.14931.">Zhang <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Dataset Poisoning:</strong> Attackers can intentionally contaminate the training data used to train LLMs, leading to compromised performance or biased outputs. For example, by injecting malicious code or biased information into the training dataset, attackers can manipulate the LLM to generate harmful or misleading content <span id="id15">[<a class="reference internal" href="#id214" title="Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.">Bowen <em>et al.</em>, 2024</a>]</span>.</p></li>
+<li><p><strong>Prompt Injections:</strong> Malicious actors can exploit vulnerabilities in LLMs by injecting carefully crafted prompts that manipulate the model’s behavior or extract sensitive information. These attacks can bypass security measures and compromise the integrity of the LLM <span id="id16">[<a class="reference internal" href="#id217" title="Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.">Benjamin <em>et al.</em>, 2024</a>]</span>.</p></li>
 </ul>
 </li>
 </ul>
 </section>
 </section>
 <section id="guidance">
-<h2><a class="toc-backref" href="#id257" role="doc-backlink"><span class="section-number">5.3. </span>Guidance</a><a class="headerlink" href="#guidance" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id264" role="doc-backlink"><span class="section-number">6.3. </span>Guidance</a><a class="headerlink" href="#guidance" title="Permalink to this heading">¶</a></h2>
 <section id="governments-organizations">
-<h3><a class="toc-backref" href="#id258" role="doc-backlink"><span class="section-number">5.3.1. </span>Governments &amp; Organizations</a><a class="headerlink" href="#governments-organizations" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id265" role="doc-backlink"><span class="section-number">6.3.1. </span>Governments &amp; Organizations</a><a class="headerlink" href="#governments-organizations" title="Permalink to this heading">¶</a></h3>
 <p>Governments and organizations around the world are beginning to develop regulations and policies to address the challenges posed by LLMs:</p>
 <ul class="simple">
-<li><p><strong>EU AI Act:</strong> The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights <span id="id17">[<a class="reference internal" href="#id215" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>]</span>. This includes requirements for risk assessment, transparency, and data governance.</p></li>
-<li><p><strong>FINRA’s Regulatory Notice:</strong> Regulatory Notice (24-09) <span id="id18">[<a class="reference internal" href="#id201" title="Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: https://www.finra.org/rules-guidance/notices/24-09.">Financial Industry Regulatory Authority, 2024</a>]</span> from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.</p></li>
-<li><p><strong>Guidelines for Trustworthy AI:</strong> Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment <span id="id19">[<a class="reference internal" href="#id215" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>, <a class="reference internal" href="#id216" title="European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf.">European Medicines Agency, 2024</a>]</span>.</p></li>
-<li><p><strong>UNICEF:</strong> UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights <span id="id20">[<a class="reference internal" href="#id218" title="UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: https://www.unicef.org/innocenti/reports/policy-guidance-ai-children.">UNICEF, 2024</a>]</span>.  The guidance emphasizes nine key requirements:</p>
+<li><p><strong>EU AI Act:</strong> The European Union is developing the AI Act, which aims to regulate high-risk AI systems, including LLMs, to ensure safety and fundamental rights <span id="id17">[<a class="reference internal" href="#id222" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>]</span>. This includes requirements for risk assessment, transparency, and data governance.</p></li>
+<li><p><strong>FINRA’s Regulatory Notice:</strong> Regulatory Notice (24-09) <span id="id18">[<a class="reference internal" href="#id208" title="Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: https://www.finra.org/rules-guidance/notices/24-09.">Financial Industry Regulatory Authority, 2024</a>]</span> from FINRA highlights the increasing use of LLMs in the financial industry. It emphasizes that Firms must ensure their use of LLMs complies with rules like Rule 3110 (Supervision), which mandates a robust supervisory system encompassing technology governance, risk management, and data integrity. Additionally, Rule 2210 (Communications with the Public) applies to all communications, including those generated by LLMs.</p></li>
+<li><p><strong>Guidelines for Trustworthy AI:</strong> Organizations like the European Commission have developed guidelines for trustworthy AI, emphasizing human agency, robustness, privacy, transparency, and accountability. These guidelines provide a framework for ethical AI development and deployment <span id="id19">[<a class="reference internal" href="#id222" title="Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.">Exabeam, 2024</a>, <a class="reference internal" href="#id223" title="European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf.">European Medicines Agency, 2024</a>]</span>.</p></li>
+<li><p><strong>UNICEF:</strong> UNICEF has published policy guidance on AI for Children, advocating for the development and deployment of AI systems that uphold children’s rights <span id="id20">[<a class="reference internal" href="#id225" title="UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: https://www.unicef.org/innocenti/reports/policy-guidance-ai-children.">UNICEF, 2024</a>]</span>.  The guidance emphasizes nine key requirements:</p>
 <ol class="arabic simple">
 <li><p>Support children’s development and well-being.</p></li>
 <li><p>Ensure inclusion of and for children.</p></li>
@@ -459,7 +468,7 @@ <h3><a class="toc-backref" href="#id258" role="doc-backlink"><span class="sectio
 <li><p>contestability and redress.</p></li>
 </ol>
 </li>
-<li><p><strong>China:</strong> China’s Generative AI Measures <span id="id22">[<a class="reference internal" href="#id220" title="Library of Congress. China: generative ai measures finalized. July 2023. URL: https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/.">Library of Congress, 2023</a>]</span>, enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:</p>
+<li><p><strong>China:</strong> China’s Generative AI Measures <span id="id22">[<a class="reference internal" href="#id227" title="Library of Congress. China: generative ai measures finalized. July 2023. URL: https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/.">Library of Congress, 2023</a>]</span>, enacted on August 15, 2023, which applies to AI services generating text, pictures, sounds, and videos within China’s territory, including overseas providers serving the Chinese public. It includes the following key requirements:</p>
 <ul>
 <li><p>Service providers must prevent illegal or discriminatory content and ensure transparency</p></li>
 <li><p>Training data must come from legitimate sources and respect intellectual property rights</p></li>
@@ -471,7 +480,7 @@ <h3><a class="toc-backref" href="#id258" role="doc-backlink"><span class="sectio
 <li><p>The measure focuses more heavily on privacy law compliance compared to its draft version</p></li>
 </ul>
 </li>
-<li><p><strong>US:</strong> The US has developed a voluntary guidance document developed by the National Institute of Standards and Technology to help organizations better manage risks related to AI systems <span id="id23">[<a class="reference internal" href="#id221" title="National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: https://www.nist.gov/itl/ai-risk-management-framework.">National Institute of Standards and Technology, 2024</a>]</span>. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.</p>
+<li><p><strong>US:</strong> The US has developed a voluntary guidance document developed by the National Institute of Standards and Technology to help organizations better manage risks related to AI systems <span id="id23">[<a class="reference internal" href="#id228" title="National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: https://www.nist.gov/itl/ai-risk-management-framework.">National Institute of Standards and Technology, 2024</a>]</span>. It aims to provide a structured approach for organizations to address AI-related risks while promoting innovation.</p>
 <ul>
 <li><p>Core Structure:</p>
 <ol class="arabic simple">
@@ -494,11 +503,11 @@ <h3><a class="toc-backref" href="#id258" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="private-sector">
-<h3><a class="toc-backref" href="#id259" role="doc-backlink"><span class="section-number">5.3.2. </span>Private Sector</a><a class="headerlink" href="#private-sector" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id266" role="doc-backlink"><span class="section-number">6.3.2. </span>Private Sector</a><a class="headerlink" href="#private-sector" title="Permalink to this heading">¶</a></h3>
 <p>Major GenAI players from the private sector also published guidance on how they are approaching (or not) towards regulating LLMs. We cover OpenAI, Anthropic and Google’s views. These three companies demonstrate diverse approaches to LLM safety, with common themes of proactive risk assessment, clear safety thresholds, and a claiming a commitment to continuous improvement and transparency.</p>
 <section id="openai">
-<h4><a class="toc-backref" href="#id260" role="doc-backlink"><span class="section-number">5.3.2.1. </span>OpenAI</a><a class="headerlink" href="#openai" title="Permalink to this heading">¶</a></h4>
-<p>OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its <strong>Preparedness Framework</strong> <span id="id24">[<a class="reference internal" href="#id222" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>, a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.</p>
+<h4><a class="toc-backref" href="#id267" role="doc-backlink"><span class="section-number">6.3.2.1. </span>OpenAI</a><a class="headerlink" href="#openai" title="Permalink to this heading">¶</a></h4>
+<p>OpenAI’s approach to mitigating catastrophic risks from LLMs centers around its <strong>Preparedness Framework</strong> <span id="id24">[<a class="reference internal" href="#id229" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>, a living document outlining processes for tracking, evaluating, forecasting, and protecting against potential harms.</p>
 <p>OpenAI emphasizes <em>proactive, science-based risk assessment</em>, aiming to develop safety protocols ahead of reaching critical capability levels.</p>
 <p>The framework comprises five key elements:</p>
 <ul class="simple">
@@ -508,7 +517,7 @@ <h4><a class="toc-backref" href="#id260" role="doc-backlink"><span class="sectio
 <li><p><strong>Tasking the Preparedness Team:</strong>  A dedicated team drives the technical work of the Preparedness Framework, including research, evaluations, monitoring, forecasting, and reporting to a Safety Advisory Group.</p></li>
 <li><p><strong>Creating a Cross-Functional Advisory Body:</strong> A Safety Advisory Group (SAG) provides expertise and recommendations to OpenAI’s leadership and Board of Directors on safety decisions.</p></li>
 </ul>
-<p>For instance, the scorecard for Model Autonomy risk is shown in <a class="reference internal" href="#openai-risk-scoring"><span class="std std-numref">Fig. 5.3</span></a>:</p>
+<p>For instance, the scorecard for Model Autonomy risk is shown in <a class="reference internal" href="#openai-risk-scoring"><span class="std std-numref">Fig. 6.3</span></a>:</p>
 <blockquote>
 <div><p>Model autonomy enables actors to run scaled misuse that can adapt to environmental
 changes and evade attempts to mitigate or shut down operations. Autonomy is also a
@@ -517,18 +526,18 @@ <h4><a class="toc-backref" href="#id260" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="openai-risk-scoring">
 <a class="reference internal image-reference" href="../_images/openai_score.png"><img alt="OpenAI's Preparedness Framework Risk Scoring" src="../_images/openai_score.png" style="width: 80%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.3 </span><span class="caption-text">OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk <span id="id25">[<a class="reference internal" href="#id222" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>.</span><a class="headerlink" href="#openai-risk-scoring" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.3 </span><span class="caption-text">OpenAI’s Preparedness Framework risk scoring methodology showing the gradation scale from “low” to “critical” model autonomy risk <span id="id25">[<a class="reference internal" href="#id229" title="OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.">OpenAI, 2024</a>]</span>.</span><a class="headerlink" href="#openai-risk-scoring" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>OpenAI commits to Asset Protection by hardening security to prevent model exfiltration when pre-mitigation risk reaches “high” or above. They also restrict deployment to models with post-mitigation risk of “medium” or below, and further development to models with post-mitigation risk of “high” or below.</p>
 </section>
 <section id="anthropic">
-<h4><a class="toc-backref" href="#id261" role="doc-backlink"><span class="section-number">5.3.2.2. </span>Anthropic</a><a class="headerlink" href="#anthropic" title="Permalink to this heading">¶</a></h4>
-<p>Anthropic adopts a framework based on <strong>AI Safety Levels (ASLs)</strong> <span id="id26">[<a class="reference internal" href="#id223" title="Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf.">Anthropic, 2024</a>]</span>, inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in <a class="reference internal" href="#anthropic-risk-scoring"><span class="std std-numref">Fig. 5.4</span></a>.</p>
+<h4><a class="toc-backref" href="#id268" role="doc-backlink"><span class="section-number">6.3.2.2. </span>Anthropic</a><a class="headerlink" href="#anthropic" title="Permalink to this heading">¶</a></h4>
+<p>Anthropic adopts a framework based on <strong>AI Safety Levels (ASLs)</strong> <span id="id26">[<a class="reference internal" href="#id230" title="Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf.">Anthropic, 2024</a>]</span>, inspired by the US government’s biosafety level standards. ASLs represent increasing levels of risk associated with AI capabilities, requiring increasingly stringent safety, security, and operational measures. Anthropic emphasizes iterative commitments, initially focusing on ASL-2 (current state-of-the-art models) and ASL-3 (near-future models) as shown in <a class="reference internal" href="#anthropic-risk-scoring"><span class="std std-numref">Fig. 6.4</span></a>.</p>
 <figure class="align-center" id="anthropic-risk-scoring">
 <a class="reference internal image-reference" href="../_images/ant_score.png"><img alt="Anthropic's AI Safety Levels (ASLs) framework showing the gradation scale from &quot;low&quot; to &quot;critical&quot; model autonomy risk." src="../_images/ant_score.png" style="width: 75%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.4 </span><span class="caption-text">Anthropic’s AI Safety Levels (ASLs) framework showing the gradation scale from “low” to “critical” model autonomy risk.</span><a class="headerlink" href="#anthropic-risk-scoring" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.4 </span><span class="caption-text">Anthropic’s AI Safety Levels (ASLs) framework showing the gradation scale from “low” to “critical” model autonomy risk.</span><a class="headerlink" href="#anthropic-risk-scoring" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p><strong>ASL-2</strong></p>
@@ -552,12 +561,12 @@ <h4><a class="toc-backref" href="#id261" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="google">
-<h4><a class="toc-backref" href="#id262" role="doc-backlink"><span class="section-number">5.3.2.3. </span>Google</a><a class="headerlink" href="#google" title="Permalink to this heading">¶</a></h4>
-<p>Google’s approach, as detailed in the <strong>Frontier Safety Framework</strong> <span id="id27">[<a class="reference internal" href="#id224" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>, focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of <strong>Critical Capability Levels (CCLs)</strong>, representing capability thresholds where models, absent mitigation, may pose heightened risk.</p>
+<h4><a class="toc-backref" href="#id269" role="doc-backlink"><span class="section-number">6.3.2.3. </span>Google</a><a class="headerlink" href="#google" title="Permalink to this heading">¶</a></h4>
+<p>Google’s approach, as detailed in the <strong>Frontier Safety Framework</strong> <span id="id27">[<a class="reference internal" href="#id231" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>, focuses on identifying and mitigating severe risks from powerful foundation models. They introduce the concept of <strong>Critical Capability Levels (CCLs)</strong>, representing capability thresholds where models, absent mitigation, may pose heightened risk.</p>
 <figure class="align-center" id="google-risk-scoring">
 <a class="reference internal image-reference" href="../_images/google_score.png"><img alt="Google's Frontier Safety Framework Risk Scoring" src="../_images/google_score.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.5 </span><span class="caption-text">Google’s Frontier Safety Framework Risk Scoring <span id="id28">[<a class="reference internal" href="#id224" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>.</span><a class="headerlink" href="#google-risk-scoring" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.5 </span><span class="caption-text">Google’s Frontier Safety Framework Risk Scoring <span id="id28">[<a class="reference internal" href="#id231" title="DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.">DeepMind, 2024</a>]</span>.</span><a class="headerlink" href="#google-risk-scoring" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The framework identifies initial CCLs in the domains of autonomy, biosecurity, cybersecurity, and machine learning R&amp;D.  Key components of the framework include:</p>
@@ -570,23 +579,23 @@ <h4><a class="toc-backref" href="#id262" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="rubrics">
-<h3><a class="toc-backref" href="#id263" role="doc-backlink"><span class="section-number">5.3.3. </span>Rubrics</a><a class="headerlink" href="#rubrics" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id270" role="doc-backlink"><span class="section-number">6.3.3. </span>Rubrics</a><a class="headerlink" href="#rubrics" title="Permalink to this heading">¶</a></h3>
 <p>In order to quantify the safety of LLMs, AI safety rubrics have been developed, prominently by MLCommons and the Centre for the Governance of AI.</p>
 <section id="mlcommons-ai-safety-benchmark">
-<h4><a class="toc-backref" href="#id264" role="doc-backlink"><span class="section-number">5.3.3.1. </span>MLCommons AI Safety Benchmark</a><a class="headerlink" href="#mlcommons-ai-safety-benchmark" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id271" role="doc-backlink"><span class="section-number">6.3.3.1. </span>MLCommons AI Safety Benchmark</a><a class="headerlink" href="#mlcommons-ai-safety-benchmark" title="Permalink to this heading">¶</a></h4>
 <p>The MLCommons AI Safety Working Group has developed a comprehensive benchmark to assess safety risks in AI systems, with a particular focus on language models <span id="id29">[<a class="reference internal" href="#id125" title="Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.">Vidgen <em>et al.</em>, 2024</a>]</span>. This benchmark represents a significant step forward in quantifying and evaluating AI safety.</p>
 <p>The benchmark incorporates:</p>
 <ul class="simple">
 <li><p>A taxonomy of 13 hazard categories covering critical areas like violent crimes, hate speech, and child exploitation</p></li>
 <li><p>Test items and prompts designed to probe potentially harmful model behaviors</p></li>
 <li><p>Various interaction types to test model responses in different contexts</p></li>
-<li><p>An automated evaluation system powered by LlamaGuard <span id="id30">[<a class="reference internal" href="#id192" title="Meta AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.">AI, 2024</a>]</span></p></li>
+<li><p>An automated evaluation system powered by LlamaGuard <span id="id30">[<a class="reference internal" href="#id199" title="Meta-AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.">Meta-AI, 2024</a>]</span></p></li>
 </ul>
-<p>A leaderboard <span id="id31">[<a class="reference internal" href="#id167" title="MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: https://ailuminate.mlcommons.org/benchmarks/.">MLCommons, 2024</a>]</span> is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in <a class="reference internal" href="#mlcommons-benchmark"><span class="std std-numref">Fig. 5.6</span></a> is deemed as “Fair”.</p>
+<p>A leaderboard <span id="id31">[<a class="reference internal" href="#id167" title="MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: https://ailuminate.mlcommons.org/benchmarks/.">MLCommons, 2024</a>]</span> is published with benchmark results of common proprietary and open source models ranked by their safety scores. For instance, Claude 3.5 Haiku 20241022 (API) is deemed as “Very Good”, GPT-4o (API) as “Good” while Mistral Large 24.11 (API) shown in <a class="reference internal" href="#mlcommons-benchmark"><span class="std std-numref">Fig. 6.6</span></a> is deemed as “Fair”.</p>
 <figure class="align-center" id="mlcommons-benchmark">
 <a class="reference internal image-reference" href="../_images/commons.png"><img alt="MLCommons AI Safety Benchmark" src="../_images/commons.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.6 </span><span class="caption-text">MLCommons AI Safety Benchmark Results for Mistral Large 24.11 (API) <span id="id32">[<a class="reference internal" href="#id125" title="Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.">Vidgen <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#mlcommons-benchmark" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.6 </span><span class="caption-text">MLCommons AI Safety Benchmark Results for Mistral Large 24.11 (API) <span id="id32">[<a class="reference internal" href="#id125" title="Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.">Vidgen <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#mlcommons-benchmark" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The benchmark uses the following scoring system to evaluate model safety:</p>
@@ -600,15 +609,15 @@ <h4><a class="toc-backref" href="#id264" role="doc-backlink"><span class="sectio
 <p>The goal is to establish standardized metrics for measuring AI system safety and accelerate research into safety mitigation strategies.</p>
 </section>
 <section id="centre-for-the-governance-of-ai-rubric">
-<h4><a class="toc-backref" href="#id265" role="doc-backlink"><span class="section-number">5.3.3.2. </span>Centre for the Governance of AI Rubric</a><a class="headerlink" href="#centre-for-the-governance-of-ai-rubric" title="Permalink to this heading">¶</a></h4>
-<p>The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks <span id="id33">[<a class="reference internal" href="#id217" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.</p>
+<h4><a class="toc-backref" href="#id272" role="doc-backlink"><span class="section-number">6.3.3.2. </span>Centre for the Governance of AI Rubric</a><a class="headerlink" href="#centre-for-the-governance-of-ai-rubric" title="Permalink to this heading">¶</a></h4>
+<p>The Centre for the Governance of AI has developed a rubric for evaluating AI safety frameworks <span id="id33">[<a class="reference internal" href="#id224" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>. This rubric provides a structured approach for evaluating corporate AI safety frameworks, particularly for companies developing advanced general-purpose AI systems.</p>
 <figure class="align-center" id="centerai">
 <a class="reference internal image-reference" href="../_images/centerai.png"><img alt="Centre for the Governance of AI Rubric" src="../_images/centerai.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.7 </span><span class="caption-text">Sample grading by the Centre for the Governance of AI Rubric <span id="id34">[<a class="reference internal" href="#id217" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#centerai" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.7 </span><span class="caption-text">Sample grading by the Centre for the Governance of AI Rubric <span id="id34">[<a class="reference internal" href="#id224" title="Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.">Alaga <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#centerai" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p><a class="reference internal" href="#centerai"><span class="std std-numref">Fig. 5.7</span></a> shows a sample grading to illustrate the evaluation criteria and quality tiers. The rubric evaluates safety frameworks across three key dimensions:</p>
+<p><a class="reference internal" href="#centerai"><span class="std std-numref">Fig. 6.7</span></a> shows a sample grading to illustrate the evaluation criteria and quality tiers. The rubric evaluates safety frameworks across three key dimensions:</p>
 <ol class="arabic simple">
 <li><p>Effectiveness</p></li>
 <li><p>Adherence</p></li>
@@ -618,8 +627,8 @@ <h4><a class="toc-backref" href="#id265" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="porquoi">
-<h3><a class="toc-backref" href="#id266" role="doc-backlink"><span class="section-number">5.3.4. </span>Porquoi</a><a class="headerlink" href="#porquoi" title="Permalink to this heading">¶</a></h3>
-<p>Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in <span id="id35">[<a class="reference internal" href="#id219" title="Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? Royal Society Open Science, 11(8):240197, 2024. URL: https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197, arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197, doi:10.1098/rsos.240197.">Wachter <em>et al.</em>, 2024</a>]</span>.</p>
+<h3><a class="toc-backref" href="#id273" role="doc-backlink"><span class="section-number">6.3.4. </span>Porquoi</a><a class="headerlink" href="#porquoi" title="Permalink to this heading">¶</a></h3>
+<p>Do we need regulations specifically for LLMs? That was the question posed by Oxford University researchers in <span id="id35">[<a class="reference internal" href="#id226" title="Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? Royal Society Open Science, 11(8):240197, 2024. URL: https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197, arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197, doi:10.1098/rsos.240197.">Wachter <em>et al.</em>, 2024</a>]</span>.</p>
 <p>Pro-regulation arguments highlight some of the key risks and harms associated with LLMs we have discussed in this chapter:</p>
 <ul class="simple">
 <li><p><strong>LLMs can generate harmful content:</strong> As explored in the example of a stealth edit, LLMs can be manipulated to produce outputs that promote violence, hate speech, or misinformation. Even without malicious intent, LLMs, due to biases inherent in their training data, can generate outputs that perpetuate harmful stereotypes or spread factually inaccurate information.</p></li>
@@ -636,17 +645,17 @@ <h3><a class="toc-backref" href="#id266" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="approaches">
-<h2><a class="toc-backref" href="#id267" role="doc-backlink"><span class="section-number">5.4. </span>Approaches</a><a class="headerlink" href="#approaches" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id274" role="doc-backlink"><span class="section-number">6.4. </span>Approaches</a><a class="headerlink" href="#approaches" title="Permalink to this heading">¶</a></h2>
 <p>Several approaches and techniques are being developed to help effectively implement AI/LLM Safety alignment.</p>
 <section id="red-teaming">
-<h3><a class="toc-backref" href="#id268" role="doc-backlink"><span class="section-number">5.4.1. </span>Red Teaming</a><a class="headerlink" href="#red-teaming" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id275" role="doc-backlink"><span class="section-number">6.4.1. </span>Red Teaming</a><a class="headerlink" href="#red-teaming" title="Permalink to this heading">¶</a></h3>
 <p>Red teaming is a critical security practice adapted from cybersecurity for evaluating LLMs. Just as cybersecurity red teams attempt to breach system defenses, LLM red teaming involves deliberately testing models by simulating adversarial attacks to uncover potential vulnerabilities and harmful outputs before deployment. We can outline LLMs Red teaming around three key aspects:</p>
 <ol class="arabic simple">
 <li><p>The primary purpose is to systematically identify potential vulnerabilities by crafting prompts designed to elicit harmful outputs, including biased content, misinformation, or sensitive data exposure. Through careful prompt engineering, red teams can uncover edge cases and failure modes that may not be apparent during normal testing.</p></li>
 <li><p>The process relies on a dedicated team of security experts and AI researchers who develop sophisticated adversarial scenarios. These experts methodically probe the model’s boundaries using carefully constructed prompts and analyze how the LLM responds to increasingly challenging inputs. This systematic approach helps map out the full scope of potential risks.</p></li>
 <li><p>The key benefit is that red teaming enables proactive identification and remediation of safety issues before public deployment. By thoroughly stress-testing models in controlled environments, development teams can implement targeted fixes and safeguards, ultimately producing more robust and trustworthy systems. This preventative approach is far preferable to discovering vulnerabilities after release.</p></li>
 </ol>
-<p>A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model <span id="id36">[<a class="reference internal" href="#id225" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.</p>
+<p>A particularly powerful approach involves using one language model (the “red LM”) to systematically probe and test another target model <span id="id36">[<a class="reference internal" href="#id232" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>. The red LM generates diverse test cases specifically crafted to elicit problematic behaviors, while a classifier evaluates the target model’s responses for specific categories of harm.</p>
 <p>This LLM-based red teaming process consists of three main components:</p>
 <ol class="arabic simple">
 <li><p><strong>Systematic Test Generation</strong>: The red LM creates a wide array of test cases using multiple techniques:</p>
@@ -665,7 +674,7 @@ <h3><a class="toc-backref" href="#id268" role="doc-backlink"><span class="sectio
 </ul>
 </li>
 </ol>
-<p>These varied approaches help ensure comprehensive coverage across different types of potential <a class="reference external" href="http://vulnerabilities.In">vulnerabilities.In</a> this research <span id="id37">[<a class="reference internal" href="#id225" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>, a 280B parameter  “red-LM” uncovered numerous concerning behaviors:</p>
+<p>These varied approaches help ensure comprehensive coverage across different types of potential <a class="reference external" href="http://vulnerabilities.In">vulnerabilities.In</a> this research <span id="id37">[<a class="reference internal" href="#id232" title="Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.">Perez <em>et al.</em>, 2022</a>]</span>, a 280B parameter  “red-LM” uncovered numerous concerning behaviors:</p>
 <ul class="simple">
 <li><p>Generation of offensive content including discriminatory statements and explicit material</p></li>
 <li><p>Unauthorized disclosure of training data including personal information</p></li>
@@ -675,8 +684,8 @@ <h3><a class="toc-backref" href="#id268" role="doc-backlink"><span class="sectio
 <p>While LLM-based red teaming offers significant advantages over manual testing in terms of scale and systematic coverage, it also has important limitations. The red LM itself may have biases that affect test case generation, and results require careful interpretation within broader context. Further, Red teaming should be viewed as one component of a comprehensive safety framework rather than a complete solution.</p>
 </section>
 <section id="constitutional-ai">
-<h3><a class="toc-backref" href="#id269" role="doc-backlink"><span class="section-number">5.4.2. </span>Constitutional AI</a><a class="headerlink" href="#constitutional-ai" title="Permalink to this heading">¶</a></h3>
-<p>Anthropic has developed Constitutional AI (CAI) <span id="id38">[<a class="reference internal" href="#id227" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span> as a novel approach to enhance the safety of large language models (LLMs). CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.</p>
+<h3><a class="toc-backref" href="#id276" role="doc-backlink"><span class="section-number">6.4.2. </span>Constitutional AI</a><a class="headerlink" href="#constitutional-ai" title="Permalink to this heading">¶</a></h3>
+<p>Anthropic has developed Constitutional AI (CAI) <span id="id38">[<a class="reference internal" href="#id234" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span> as a novel approach to enhance the safety of large language models (LLMs). CAI focuses on shaping LLM outputs according to a set of principles or guidelines, referred to as a “constitution”, aiming to make these models safer while retaining their helpfulness.</p>
 <p>Here’s how Anthropic utilises CAI to promote LLM safety:</p>
 <ul class="simple">
 <li><p><strong>Minimising Harm Through Self-Critique:</strong>  Instead of relying solely on human feedback for training, Anthropic leverages the LLM’s own capabilities to critique and revise its outputs based on the principles enshrined in its constitution. This approach is termed “Reinforcement Learning from AI Feedback (RLAIF)”.</p></li>
@@ -684,19 +693,19 @@ <h3><a class="toc-backref" href="#id269" role="doc-backlink"><span class="sectio
 <li><p><strong>Enhancing Transparency and Scalability:</strong> Anthropic highlights that encoding safety principles into a “constitution” increases transparency in the model’s decision-making process, allowing users and regulators to better understand how the LLM operates.  Additionally, CAI proves to be more scalable and efficient compared to RLHF, requiring fewer human feedback labels and reducing the exposure of human reviewers to potentially harmful content.</p></li>
 </ul>
 <p>Anthropic’s research indicates that CAI leads to LLMs that are both more harmless and helpful. These models are less evasive, engage with user requests, and are more likely to explain their reasoning when refusing unsafe or unethical requests.</p>
-<p>The key insight as proposed by Anthropic is that Constitutional RL manages to break the traditional trade-off between helpfulness and harmlessness. While standard RLHF models tend to become less helpful as they become more harmless (often by becoming more evasive), Constitutional RL achieves high scores in both dimensions simultaneously as demonstrated in <a class="reference internal" href="#anthropic-cai-tradeoff"><span class="std std-numref">Fig. 5.8</span></a>.</p>
+<p>The key insight as proposed by Anthropic is that Constitutional RL manages to break the traditional trade-off between helpfulness and harmlessness. While standard RLHF models tend to become less helpful as they become more harmless (often by becoming more evasive), Constitutional RL achieves high scores in both dimensions simultaneously as demonstrated in <a class="reference internal" href="#anthropic-cai-tradeoff"><span class="std std-numref">Fig. 6.8</span></a>.</p>
 <figure class="align-center" id="anthropic-cai-tradeoff">
 <a class="reference internal image-reference" href="../_images/cai.png"><img alt="Anthropic's Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness." src="../_images/cai.png" style="width: 70%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.8 </span><span class="caption-text">Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness <span id="id39">[<a class="reference internal" href="#id227" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span>.</span><a class="headerlink" href="#anthropic-cai-tradeoff" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.8 </span><span class="caption-text">Anthropic’s Constitutional AI (CAI) achieves high scores in both helpfulness and harmlessness <span id="id39">[<a class="reference internal" href="#id234" title="Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.">Askell <em>et al.</em>, 2023</a>]</span>.</span><a class="headerlink" href="#anthropic-cai-tradeoff" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>Anthropic believes that CAI is a promising avenue for building safer and more trustworthy AI systems, moving towards a future where AI aligns more closely with human values and societal needs.</p>
 </section>
 <section id="explainable-ai-xai">
-<h3><a class="toc-backref" href="#id270" role="doc-backlink"><span class="section-number">5.4.3. </span>Explainable AI (XAI)</a><a class="headerlink" href="#explainable-ai-xai" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id277" role="doc-backlink"><span class="section-number">6.4.3. </span>Explainable AI (XAI)</a><a class="headerlink" href="#explainable-ai-xai" title="Permalink to this heading">¶</a></h3>
 <p>XAI techniques aim to make the decision-making processes of LLMs more transparent and understandable. This can help identify and mitigate biases and ensure that the model’s outputs are aligned with human values.</p>
-<p>XAI can contribute to LLM safety in multiple ways, including <span id="id40">[<a class="reference internal" href="#id226" title="Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: https://arxiv.org/abs/2407.15248, arXiv:2407.15248.">Cambria <em>et al.</em>, 2024</a>]</span>:</p>
+<p>XAI can contribute to LLM safety in multiple ways, including <span id="id40">[<a class="reference internal" href="#id233" title="Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: https://arxiv.org/abs/2407.15248, arXiv:2407.15248.">Cambria <em>et al.</em>, 2024</a>]</span>:</p>
 <ul class="simple">
 <li><p><strong>Identifying and Mitigating Bias:</strong> LLMs can inherit biases present in their vast training data, leading to unfair or discriminatory outputs.  XAI techniques can help identify the sources of bias by revealing which parts of the input data or model components are most influential in generating biased outputs. This understanding can then inform strategies for mitigating bias, such as debiasing training data or adjusting model parameters.</p></li>
 <li><p><strong>Detecting and Addressing Hallucinations:</strong> LLMs can generate outputs that sound plausible but are factually incorrect or nonsensical, a phenomenon known as “hallucination.”  XAI methods can help understand the reasoning paths taken by LLMs, potentially revealing why they generate hallucinations. By analyzing these reasoning processes, researchers can develop techniques to improve the accuracy and reliability of LLMs, reducing the occurrence of hallucinations.</p></li>
@@ -706,16 +715,16 @@ <h3><a class="toc-backref" href="#id270" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="designing-a-safety-plan">
-<h2><a class="toc-backref" href="#id271" role="doc-backlink"><span class="section-number">5.5. </span>Designing a Safety Plan</a><a class="headerlink" href="#designing-a-safety-plan" title="Permalink to this heading">¶</a></h2>
-<p>Building safe and reliable AI systems requires a comprehensive safety plan that addresses potential risks and establishes clear guidelines for development and deployment. This section outlines a structured approach to designing such a plan, breaking down the process into key phases from initial policy definition through implementation and monitoring as depicted in <a class="reference internal" href="#safety-plan"><span class="std std-numref">Fig. 5.9</span></a>.</p>
+<h2><a class="toc-backref" href="#id278" role="doc-backlink"><span class="section-number">6.5. </span>Designing a Safety Plan</a><a class="headerlink" href="#designing-a-safety-plan" title="Permalink to this heading">¶</a></h2>
+<p>Building safe and reliable AI systems requires a comprehensive safety plan that addresses potential risks and establishes clear guidelines for development and deployment. This section outlines a structured approach to designing such a plan, breaking down the process into key phases from initial policy definition through implementation and monitoring as depicted in <a class="reference internal" href="#safety-plan"><span class="std std-numref">Fig. 6.9</span></a>.</p>
 <figure class="align-center" id="safety-plan">
 <a class="reference internal image-reference" href="../_images/design.svg"><img alt="Safety Plan Design Phases" src="../_images/design.svg" width="80%" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.9 </span><span class="caption-text">Safety Plan Design Phases.</span><a class="headerlink" href="#safety-plan" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.9 </span><span class="caption-text">Safety Plan Design Phases.</span><a class="headerlink" href="#safety-plan" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <section id="phase-1-policy-definition">
-<h3><a class="toc-backref" href="#id272" role="doc-backlink"><span class="section-number">5.5.1. </span>Phase 1. Policy Definition</a><a class="headerlink" href="#phase-1-policy-definition" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id279" role="doc-backlink"><span class="section-number">6.5.1. </span>Phase 1. Policy Definition</a><a class="headerlink" href="#phase-1-policy-definition" title="Permalink to this heading">¶</a></h3>
 <p>When designing a safety plan, it is essential to consider establishing a policy that clarifies the definition of safety within the context of the company, its users, and stakeholders. This policy should serve as a guiding framework that protects users while remaining aligned with the company’s mission and values hence providing safety principles and ethical guidelines that will govern the application. Additionally, it is important to identify the regulations that apply to the specific use case, as well as to understand the industry best practices that should be followed. Finally, determining the organization’s risk tolerance is crucial in shaping the overall safety strategy.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -747,7 +756,7 @@ <h3><a class="toc-backref" href="#id272" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-2-user-research-risk-identification">
-<h3><a class="toc-backref" href="#id273" role="doc-backlink"><span class="section-number">5.5.2. </span>Phase 2. User Research &amp; Risk Identification</a><a class="headerlink" href="#phase-2-user-research-risk-identification" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id280" role="doc-backlink"><span class="section-number">6.5.2. </span>Phase 2. User Research &amp; Risk Identification</a><a class="headerlink" href="#phase-2-user-research-risk-identification" title="Permalink to this heading">¶</a></h3>
 <p>When considering user safety, it is essential to identify who the users are and understand their needs. Ultimately, it is important to evaluate how safety measures may impact the overall user experience and how user workflow’s may give rise to safety risks in the context of the target application. Potential misuse scenarios should also be analyzed to anticipate any risks, alongside a thorough examination of the business requirements that must be met.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -779,7 +788,7 @@ <h3><a class="toc-backref" href="#id273" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-3-evaluation-framework">
-<h3><a class="toc-backref" href="#id274" role="doc-backlink"><span class="section-number">5.5.3. </span>Phase 3. Evaluation Framework</a><a class="headerlink" href="#phase-3-evaluation-framework" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id281" role="doc-backlink"><span class="section-number">6.5.3. </span>Phase 3. Evaluation Framework</a><a class="headerlink" href="#phase-3-evaluation-framework" title="Permalink to this heading">¶</a></h3>
 <p>Key considerations in establishing an evaluation framework for safety include defining the metrics that will determine safety success, identifying the datasets that will be utilized for evaluation, and determining the relevant benchmarks that will guide the assessment process. Additionally, it is crucial to establish a method for measuring the trade-offs between safety and user experience, ensuring that both aspects are adequately addressed in the product development lifecycle.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -809,7 +818,7 @@ <h3><a class="toc-backref" href="#id274" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-4-safety-architecture-design">
-<h3><a class="toc-backref" href="#id275" role="doc-backlink"><span class="section-number">5.5.4. </span>Phase 4. Safety Architecture Design</a><a class="headerlink" href="#phase-4-safety-architecture-design" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id282" role="doc-backlink"><span class="section-number">6.5.4. </span>Phase 4. Safety Architecture Design</a><a class="headerlink" href="#phase-4-safety-architecture-design" title="Permalink to this heading">¶</a></h3>
 <p>When designing a safety architecture, it is essential to consider the integration of safety components into the overall system architecture. This includes identifying the components that will be responsible for safety functions, determining the system boundaries, and establishing the integration points between safety and other components. Additionally, it is crucial to consider the performance requirements and scalability needs of the safety system, ensuring that it can handle the expected load and maintain a high level of reliability.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -839,7 +848,7 @@ <h3><a class="toc-backref" href="#id275" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-5-implementation-tools-selection">
-<h3><a class="toc-backref" href="#id276" role="doc-backlink"><span class="section-number">5.5.5. </span>Phase 5. Implementation &amp; Tools Selection</a><a class="headerlink" href="#phase-5-implementation-tools-selection" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id283" role="doc-backlink"><span class="section-number">6.5.5. </span>Phase 5. Implementation &amp; Tools Selection</a><a class="headerlink" href="#phase-5-implementation-tools-selection" title="Permalink to this heading">¶</a></h3>
 <p>When selecting tools for implementation, it is crucial to consider the combination that best meets the specific needs of the project given business and safety requirements as well as the design of the safety architecture. Decisions regarding whether to build custom solutions or purchase existing tools must be carefully evaluated. Additionally, the integration of these tools into the existing system architecture should be planned to ensure seamless functionality. Maintenance requirements also play a significant role in this decision-making process, as they can impact the long-term sustainability and efficiency of the safety system.</p>
 <p><strong>Questions to Ask:</strong></p>
 <ul class="simple">
@@ -869,7 +878,7 @@ <h3><a class="toc-backref" href="#id276" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="phase-6-go-to-market">
-<h3><a class="toc-backref" href="#id277" role="doc-backlink"><span class="section-number">5.5.6. </span>Phase 6. Go-to-Market</a><a class="headerlink" href="#phase-6-go-to-market" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id284" role="doc-backlink"><span class="section-number">6.5.6. </span>Phase 6. Go-to-Market</a><a class="headerlink" href="#phase-6-go-to-market" title="Permalink to this heading">¶</a></h3>
 <p>Monitoring safety performance is essential to ensure that the implemented measures are effective and responsive to emerging threats. Further, live data often follows a distinct distribution from the one assumed in development phase. This should be monitored in order to allow for re-evaluation of pre-launch assumptions as well as to retrofit live data into models in use if applicable for continued enhanced performance.</p>
 <p>Establishing clear incident response procedures is crucial for addressing any safety issues that may arise promptly and efficiently. Additionally, a robust strategy for handling updates must be in place to adapt to new challenges and improve system resilience, particularly when underlying LLM-based components often suffer from continuous updates.</p>
 <p><strong>Questions to Ask:</strong></p>
@@ -902,7 +911,7 @@ <h3><a class="toc-backref" href="#id277" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="common-pitfalls">
-<h3><a class="toc-backref" href="#id278" role="doc-backlink"><span class="section-number">5.5.7. </span>Common Pitfalls</a><a class="headerlink" href="#common-pitfalls" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id285" role="doc-backlink"><span class="section-number">6.5.7. </span>Common Pitfalls</a><a class="headerlink" href="#common-pitfalls" title="Permalink to this heading">¶</a></h3>
 <p><strong>Policy Neglect.</strong> A significant issue that arises when implementation begins without clear safety policies. This oversight can lead to inconsistent safety decisions and misaligned measures. A common consequence is having a “moving target”. Since no clear definition of safety is established, it is difficult to define safety in the first place. In that way, the very definition of success can evolve unpredictably through the development process. To mitigate this risk, it is essential to establish a comprehensive policy that serves as a guiding North Star for safety-related efforts.</p>
 <p><strong>Late Evals.</strong> Another common pitfall is late evaluation planning, which occurs when the design of the evaluation framework is postponed until after implementation. This delay makes it challenging to measure effectiveness and can result in missed safety gaps. To address this, the evaluation framework should be designed early in the process and integrated throughout the development cycle.</p>
 <p><strong>Weak Evals.</strong> It is common to begin with simple evaluations that focus on a single dimension of safety, and that’s a good approach: start simple, iterate, learn, improve. However, the real mistake occurs when these initial checks are not evolved throughout the development cycle. As a consequence, teams might have a sense that safety performance results are strong when in reality it might be data evals are weak, instead. Before moving to production, it is crucial to establish well-balanced datasets that represent safety risks in a nuanced manner better representing real-world user scenarios.</p>
@@ -912,26 +921,26 @@ <h3><a class="toc-backref" href="#id278" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="technical-implementation-components">
-<h2><a class="toc-backref" href="#id279" role="doc-backlink"><span class="section-number">5.6. </span>Technical Implementation Components</a><a class="headerlink" href="#technical-implementation-components" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id286" role="doc-backlink"><span class="section-number">6.6. </span>Technical Implementation Components</a><a class="headerlink" href="#technical-implementation-components" title="Permalink to this heading">¶</a></h2>
 <section id="benchmarks-datasets">
-<h3><a class="toc-backref" href="#id280" role="doc-backlink"><span class="section-number">5.6.1. </span>Benchmarks &amp; Datasets</a><a class="headerlink" href="#benchmarks-datasets" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id287" role="doc-backlink"><span class="section-number">6.6.1. </span>Benchmarks &amp; Datasets</a><a class="headerlink" href="#benchmarks-datasets" title="Permalink to this heading">¶</a></h3>
 <section id="salad-bench">
-<h4><a class="toc-backref" href="#id281" role="doc-backlink"><span class="section-number">5.6.1.1. </span>SALAD-Bench</a><a class="headerlink" href="#salad-bench" title="Permalink to this heading">¶</a></h4>
-<p>SALAD-Bench <span id="id41">[<a class="reference internal" href="#id228" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span> is a recently published benchmark designed for evaluating the safety of Large Language Models (LLMs). It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:</p>
+<h4><a class="toc-backref" href="#id288" role="doc-backlink"><span class="section-number">6.6.1.1. </span>SALAD-Bench</a><a class="headerlink" href="#salad-bench" title="Permalink to this heading">¶</a></h4>
+<p>SALAD-Bench <span id="id41">[<a class="reference internal" href="#id235" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span> is a recently published benchmark designed for evaluating the safety of Large Language Models (LLMs). It aims to address limitations of prior safety benchmarks which focused on a narrow perspective of safety threats, lacked challenging questions, relied on time-consuming and costly human evaluation, and were limited in scope. SALAD-Bench offers several key features to aid in LLM safety:</p>
 <ul class="simple">
 <li><p><strong>Compact Taxonomy with Hierarchical Levels:</strong> It uses a structured, three-level hierarchy consisting of 6 domains, 16 tasks, and 66 categories for in-depth safety evaluation across specific dimensions. For instance,  Representation &amp; Toxicity Harms is divided into toxic content, unfair representation, and adult content. Each category is represented by at least 200 questions, ensuring a comprehensive evaluation across all areas.</p></li>
 <li><p><strong>Enhanced Difficulty and Complexity:</strong> It includes attack-enhanced questions generated using methods like human-designed prompts, red-teaming LLMs, and gradient-based methods, presenting a more stringent test of LLMs’ safety responses. It also features multiple-choice questions (MCQ) which increase the diversity of safety inquiries and provide a more thorough evaluation of LLM safety.</p></li>
 <li><p><strong>Reliable and Seamless Evaluator:</strong> SALAD-Bench features two evaluators: MD-Judge for question-answer pairs and MCQ-Judge for multiple-choice questions. MD-Judge is an LLM-based evaluator fine-tuned on standard and attack-enhanced questions labeled according to the SALAD-Bench taxonomy. It integrates taxonomy details into its input and classifies responses based on customized instruction tasks. MCQ-Judge uses in-context learning and regex parsing to assess performance on multiple-choice questions.</p></li>
 <li><p><strong>Joint-Purpose Utility:</strong> In addition to evaluating LLM safety, SALAD-Bench can be used to assess both LLM attack and defense methods. It contains subsets for testing attack techniques and examining defense capabilities, allowing researchers to improve LLM resilience against attacks.</p></li>
 </ul>
-<p><a class="reference internal" href="#id43"><span class="std std-numref">Fig. 5.10</span></a> illustrates SALAD-Bench’s question enhancement and evaluation methodology. Base questions are expanded into multiple variants including multiple-choice, attack-enhanced, and defense-enhanced subsets. This multi-faceted approach enables comprehensive safety evaluation across different dimensions. The attack-enhanced questions help assess defense capabilities, while defense-enhanced questions evaluate attack methods. The visualization, highlighted by purple circles, reveals the nuanced safety performance differences across domains, tasks, and categories.</p>
+<p><a class="reference internal" href="#id43"><span class="std std-numref">Fig. 6.10</span></a> illustrates SALAD-Bench’s question enhancement and evaluation methodology. Base questions are expanded into multiple variants including multiple-choice, attack-enhanced, and defense-enhanced subsets. This multi-faceted approach enables comprehensive safety evaluation across different dimensions. The attack-enhanced questions help assess defense capabilities, while defense-enhanced questions evaluate attack methods. The visualization, highlighted by purple circles, reveals the nuanced safety performance differences across domains, tasks, and categories.</p>
 <figure class="align-center" id="id43">
 <a class="reference internal image-reference" href="../_images/salad.png"><img alt="SALAD-Bench's compact taxonomy with hierarchical levels." src="../_images/salad.png" style="width: 70%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.10 </span><span class="caption-text">SALAD-Bench’s compact taxonomy with hierarchical levels <span id="id42">[<a class="reference internal" href="#id228" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id43" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.10 </span><span class="caption-text">SALAD-Bench’s compact taxonomy with hierarchical levels <span id="id42">[<a class="reference internal" href="#id235" title="Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.">Li <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id43" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>The SALAD-Bench benchmark is accompanied by a Leaderboard <span id="id44">[<a class="reference internal" href="#id230" title="OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.">OpenSafetyLab, 2024</a>]</span> and a dataset available on Hugging Face <span id="id45">[<a class="reference internal" href="#id229" title="OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.">OpenSafetyLab, 2024</a>]</span>.</p>
+<p>The SALAD-Bench benchmark is accompanied by a Leaderboard <span id="id44">[<a class="reference internal" href="#id237" title="OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.">OpenSafetyLab, 2024</a>]</span> and a dataset available on Hugging Face <span id="id45">[<a class="reference internal" href="#id236" title="OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.">OpenSafetyLab, 2024</a>]</span>.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">SALAD_BENCH_DATASET</span> <span class="o">=</span> <span class="s2">&quot;OpenSafetyLab/Salad-Data&quot;</span>
@@ -943,7 +952,7 @@ <h4><a class="toc-backref" href="#id281" role="doc-backlink"><span class="sectio
 </div>
 </div>
 </div>
-<p>Each row in the dataset contains a question, an associated source, and hierarchical categories as proposed by SALAD-Bench. The question is a potentially harmful prompt to be evaluated, which has been aggregated by a source. An example of a source is “GPTFuzzer” <span id="id46">[<a class="reference internal" href="#id231" title="Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: https://paperswithcode.com/dataset/gptfuzzer.">Yu <em>et al.</em>, 2024</a>]</span> which explores red teaming of large language models (LLMs) using auto-generated jailbreak prompts.</p>
+<p>Each row in the dataset contains a question, an associated source, and hierarchical categories as proposed by SALAD-Bench. The question is a potentially harmful prompt to be evaluated, which has been aggregated by a source. An example of a source is “GPTFuzzer” <span id="id46">[<a class="reference internal" href="#id238" title="Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: https://paperswithcode.com/dataset/gptfuzzer.">Yu <em>et al.</em>, 2024</a>]</span> which explores red teaming of large language models (LLMs) using auto-generated jailbreak prompts.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="n">display</span><span class="p">(</span><span class="n">Markdown</span><span class="p">(</span><span class="n">dataset</span><span class="o">.</span><span class="n">to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">head</span><span class="p">()</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">()))</span>
@@ -1049,9 +1058,9 @@ <h4><a class="toc-backref" href="#id281" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="truthfulqa">
-<h4><a class="toc-backref" href="#id282" role="doc-backlink"><span class="section-number">5.6.1.2. </span>TruthfulQA</a><a class="headerlink" href="#truthfulqa" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id289" role="doc-backlink"><span class="section-number">6.6.1.2. </span>TruthfulQA</a><a class="headerlink" href="#truthfulqa" title="Permalink to this heading">¶</a></h4>
 <p>TruthfulQA <span id="id47">[<a class="reference internal" href="#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span> is a benchmark designed to evaluate whether a language model is truthful in generating answers to questions. In its original version, it comprises 817 questions spanning 38 categories, including health, law, finance, and politics. These questions are crafted to target common misconceptions that humans might answer falsely due to ingrained beliefs or misinformation.</p>
-<p>TruthfulQA evaluates LLMs in two primary tasks (see <a class="reference internal" href="#truthqa"><span class="std std-numref">Fig. 5.11</span></a>):</p>
+<p>TruthfulQA evaluates LLMs in two primary tasks (see <a class="reference internal" href="#truthqa"><span class="std std-numref">Fig. 6.11</span></a>):</p>
 <ul class="simple">
 <li><p><strong>Generation:</strong> Given a question, the model is required to generate a 1-2 sentence answer. The primary objective is overall truthfulness, expressed as the percentage of the model’s answers that are true.</p></li>
 <li><p><strong>Multiple-choice:</strong> This task involves selecting the correct answer(s) from a set of options.</p></li>
@@ -1059,7 +1068,7 @@ <h4><a class="toc-backref" href="#id282" role="doc-backlink"><span class="sectio
 <figure class="align-center" id="truthqa">
 <a class="reference internal image-reference" href="../_images/truthqa.png"><img alt="TruthfulQA's evaluation methodology." src="../_images/truthqa.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.11 </span><span class="caption-text">TruthfulQA’s evaluation methodology <span id="id48">[<a class="reference internal" href="#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#truthqa" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.11 </span><span class="caption-text">TruthfulQA’s evaluation methodology <span id="id48">[<a class="reference internal" href="#id108" title="Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.">Lin <em>et al.</em>, 2022</a>]</span>.</span><a class="headerlink" href="#truthqa" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>TruthfulQA employs two primary evaluation modes for its multiple-choice task:</p>
@@ -1143,8 +1152,8 @@ <h4><a class="toc-backref" href="#id282" role="doc-backlink"><span class="sectio
 </table>
 </section>
 <section id="harmbench">
-<h4><a class="toc-backref" href="#id283" role="doc-backlink"><span class="section-number">5.6.1.3. </span>HarmBench</a><a class="headerlink" href="#harmbench" title="Permalink to this heading">¶</a></h4>
-<p>HarmBench <span id="id49">[<a class="reference internal" href="#id234" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span> is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework <span id="id50">[<a class="reference internal" href="#id235" title="Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: https://github.com/centerforaisafety/HarmBench.">Center for AI Safety, 2024</a>]</span> that allows users to run two main types of evaluations:</p>
+<h4><a class="toc-backref" href="#id290" role="doc-backlink"><span class="section-number">6.6.1.3. </span>HarmBench</a><a class="headerlink" href="#harmbench" title="Permalink to this heading">¶</a></h4>
+<p>HarmBench <span id="id49">[<a class="reference internal" href="#id241" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span> is a benchmark designed to evaluate the safety of LLMs. Additionally, HarmBench published a framework <span id="id50">[<a class="reference internal" href="#id242" title="Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: https://github.com/centerforaisafety/HarmBench.">Center for AI Safety, 2024</a>]</span> that allows users to run two main types of evaluations:</p>
 <ul class="simple">
 <li><p>Evaluating red teaming methods (attack methods) against a set of LLMs</p></li>
 <li><p>Evaluating LLMs against a set of red teaming methods</p></li>
@@ -1156,36 +1165,36 @@ <h4><a class="toc-backref" href="#id283" role="doc-backlink"><span class="sectio
 <li><p>Evaluating completions</p></li>
 </ul>
 <p>HarmBench primarily uses the Attack Success Rate (ASR) as its core metric. ASR measures the percentage of adversarial attempts that successfully elicit undesired behavior from the model. It also includes metrics for evaluating the effectiveness of different mitigation strategies, such as the Robust Refusal Dynamic Defense (R2D2).</p>
-<p>The framework comes with built-in support for evaluating 18 red teaming methods and 33 target LLMs, and includes classifier models for evaluating different types of behaviors (standard, contextual, and multimodal). A leaderboard is available <span id="id51">[<a class="reference internal" href="#id236" title="Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: https://www.harmbench.org/results.">Center for AI Safety, 2024</a>]</span> to track performance of both language and multimodal models on safety benchmarks.</p>
+<p>The framework comes with built-in support for evaluating 18 red teaming methods and 33 target LLMs, and includes classifier models for evaluating different types of behaviors (standard, contextual, and multimodal). A leaderboard is available <span id="id51">[<a class="reference internal" href="#id243" title="Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: https://www.harmbench.org/results.">Center for AI Safety, 2024</a>]</span> to track performance of both language and multimodal models on safety benchmarks.</p>
 <p>An interesting finding from HarmBench is that robustness is independent of model size which is in contrast to traditional benchmarks where larger models tend to perform better suggesting that training data and algorithms are far more important than model size in determining LLM robustness, emphasizing the importance of model-level defenses.</p>
 <figure class="align-center" id="id53">
 <a class="reference internal image-reference" href="../_images/harmbench.png"><img alt="Attack Success Rate (ASR) for different models." src="../_images/harmbench.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.12 </span><span class="caption-text">Attack Success Rate (ASR) for different models. HarmBench’s results suggest that  robustness is independent of model size <span id="id52">[<a class="reference internal" href="#id234" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id53" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.12 </span><span class="caption-text">Attack Success Rate (ASR) for different models. HarmBench’s results suggest that  robustness is independent of model size <span id="id52">[<a class="reference internal" href="#id241" title="Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.">Mazeika <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#id53" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>HarmBench can be used by LLM developers to proactively identify and address potential vulnerabilities in their models before deployment. By automating the red teaming process, HarmBench allows for more efficient and scalable evaluation of LLM safety, enabling developers to test their models against a wider range of adversarial scenarios. This helps improve the robustness of LLMs and reduce the risk of malicious use.</p>
 </section>
 <section id="safebench">
-<h4><a class="toc-backref" href="#id284" role="doc-backlink"><span class="section-number">5.6.1.4. </span>SafeBench</a><a class="headerlink" href="#safebench" title="Permalink to this heading">¶</a></h4>
-<p>SafeBench <span id="id54">[<a class="reference internal" href="#id233" title="ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: https://www.mlsafety.org/safebench.">ML Safety Team, 2024</a>]</span> is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.</p>
+<h4><a class="toc-backref" href="#id291" role="doc-backlink"><span class="section-number">6.6.1.4. </span>SafeBench</a><a class="headerlink" href="#safebench" title="Permalink to this heading">¶</a></h4>
+<p>SafeBench <span id="id54">[<a class="reference internal" href="#id240" title="ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: https://www.mlsafety.org/safebench.">ML Safety Team, 2024</a>]</span> is a competition designed to encourage the development of new benchmarks for assessing and mitigating risks associated with artificial intelligence.</p>
 <p>The competition is a project of the Center for AI Safety, a non-profit research organization focused on reducing societal-scale risks from AI systems. The organization has previously developed benchmarks such as MMLU, the Weapons of Mass Destruction Proxy, and the out-of-distribution detection baseline.</p>
 <p>The goal of SafeBench is to define metrics that align with progress in addressing AI safety concerns. This is driven by the understanding that metrics play a crucial role in the field of machine learning (ML). Formalizing these metrics into benchmarks is essential for evaluating and predicting potential risks posed by AI models.</p>
 <p>The competition has outlined four categories where they would like to see benchmarks: Robustness, Monitoring, Alignment, and Safety Applications. For each of these categories, the organizers have provided examples os risks, for instance under the Robustness category is <strong>Jailbreaking Text and Multimodal Models</strong>. This focuses on improving defenses against adversarial attacks. A submitted benchmark then could tackle new and ideally unseen jailbreaking attacks and defenses.</p>
 </section>
 </section>
 <section id="tools-techniques">
-<h3><a class="toc-backref" href="#id285" role="doc-backlink"><span class="section-number">5.6.2. </span>Tools &amp; Techniques</a><a class="headerlink" href="#tools-techniques" title="Permalink to this heading">¶</a></h3>
-<p>The most straightforward approach to add a safety layer to LLM applications is to implement a separate filtering layer that screens both user prompts and LLM responses. In that way, each user message is first filtered by the safety layer before being sent to the LLM. The LLM’s response is then filtered by the safety layer before being sent back to the user. Assuming a scenario where most user messages are likely to be safe, a common design pattern to minimize latency is to send your moderation requests asynchronously along with the LLM application call as shown in <a class="reference internal" href="#safety-layer"><span class="std std-numref">Fig. 5.13</span></a>.</p>
+<h3><a class="toc-backref" href="#id292" role="doc-backlink"><span class="section-number">6.6.2. </span>Tools &amp; Techniques</a><a class="headerlink" href="#tools-techniques" title="Permalink to this heading">¶</a></h3>
+<p>The most straightforward approach to add a safety layer to LLM applications is to implement a separate filtering layer that screens both user prompts and LLM responses. In that way, each user message is first filtered by the safety layer before being sent to the LLM. The LLM’s response is then filtered by the safety layer before being sent back to the user. Assuming a scenario where most user messages are likely to be safe, a common design pattern to minimize latency is to send your moderation requests asynchronously along with the LLM application call as shown in <a class="reference internal" href="#safety-layer"><span class="std std-numref">Fig. 6.13</span></a>.</p>
 <figure class="align-center" id="safety-layer">
 <a class="reference internal image-reference" href="../_images/safety_layer.svg"><img alt="Safety Layer" src="../_images/safety_layer.svg" width="90%" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.13 </span><span class="caption-text">Representative Safety Layer.</span><a class="headerlink" href="#safety-layer" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.13 </span><span class="caption-text">Representative Safety Layer.</span><a class="headerlink" href="#safety-layer" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
-<p>It is part of the design of the application to determine which risks are inherent to user prompts versus LLM responses and then implement the safety layer accordingly. For instance, <em>profanity</em> may be considered a risk inherent to both user prompts and LLM responses, while <em>jailbreaking</em> an user prompt specific risk and <em>hallucination</em> a risk inherent to LLM responses as demonstrated in <a class="reference internal" href="#safety-layer-table"><span class="std std-numref">Table 5.1</span></a>.</p>
+<p>It is part of the design of the application to determine which risks are inherent to user prompts versus LLM responses and then implement the safety layer accordingly. For instance, <em>profanity</em> may be considered a risk inherent to both user prompts and LLM responses, while <em>jailbreaking</em> an user prompt specific risk and <em>hallucination</em> a risk inherent to LLM responses as demonstrated in <a class="reference internal" href="#safety-layer-table"><span class="std std-numref">Table 6.1</span></a>.</p>
 <table class="docutils align-center" id="safety-layer-table">
-<caption><span class="caption-number">Table 5.1 </span><span class="caption-text">Representative Safety Layer Risk Map.</span><a class="headerlink" href="#safety-layer-table" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 6.1 </span><span class="caption-text">Representative Safety Layer Risk Map.</span><a class="headerlink" href="#safety-layer-table" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Risk</p></th>
 <th class="head"><p>Prompt</p></th>
@@ -1213,10 +1222,10 @@ <h3><a class="toc-backref" href="#id285" role="doc-backlink"><span class="sectio
 </table>
 <p>There are several specialized commercial and open source tools that can be used to implement a filtering layer, which we can categorize into two types: Rules-Based and LLM-Based.</p>
 <section id="rules-based-safety-filtering">
-<h4><a class="toc-backref" href="#id286" role="doc-backlink"><span class="section-number">5.6.2.1. </span>Rules-Based Safety Filtering</a><a class="headerlink" href="#rules-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
-<p>Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard <span id="id55">[<a class="reference internal" href="#id243" title="ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: https://github.com/protectai/llm-guard.">ProtectAI, 2024</a>]</span>, AWS Comprehend <span id="id56">[<a class="reference internal" href="#id245" title="Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: https://aws.amazon.com/comprehend/.">Amazon Web Services, 2024</a>]</span>, and NeMo Guardrails <span id="id57">[<a class="reference internal" href="#id244" title="NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: https://github.com/NVIDIA/NeMo-Guardrails.">NVIDIA, 2024</a>]</span> as detailed in <a class="reference internal" href="#safety-layer-tools"><span class="std std-numref">Table 5.2</span></a>.</p>
+<h4><a class="toc-backref" href="#id293" role="doc-backlink"><span class="section-number">6.6.2.1. </span>Rules-Based Safety Filtering</a><a class="headerlink" href="#rules-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
+<p>Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard <span id="id55">[<a class="reference internal" href="#id250" title="ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: https://github.com/protectai/llm-guard.">ProtectAI, 2024</a>]</span>, AWS Comprehend <span id="id56">[<a class="reference internal" href="#id252" title="Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: https://aws.amazon.com/comprehend/.">Amazon Web Services, 2024</a>]</span>, and NeMo Guardrails <span id="id57">[<a class="reference internal" href="#id251" title="NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: https://github.com/NVIDIA/NeMo-Guardrails.">NVIDIA, 2024</a>]</span> as detailed in <a class="reference internal" href="#safety-layer-tools"><span class="std std-numref">Table 6.2</span></a>.</p>
 <table class="docutils align-default" id="safety-layer-tools">
-<caption><span class="caption-number">Table 5.2 </span><span class="caption-text">Rules-Based Safety Filtering Tools.</span><a class="headerlink" href="#safety-layer-tools" title="Permalink to this table">¶</a></caption>
+<caption><span class="caption-number">Table 6.2 </span><span class="caption-text">Rules-Based Safety Filtering Tools.</span><a class="headerlink" href="#safety-layer-tools" title="Permalink to this table">¶</a></caption>
 <thead>
 <tr class="row-odd"><th class="head"><p>Tool</p></th>
 <th class="head"><p>Key Features</p></th>
@@ -1275,13 +1284,13 @@ <h4><a class="toc-backref" href="#id286" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="llm-based-safety-filtering">
-<h4><a class="toc-backref" href="#id287" role="doc-backlink"><span class="section-number">5.6.2.2. </span>LLM-Based Safety Filtering</a><a class="headerlink" href="#llm-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id294" role="doc-backlink"><span class="section-number">6.6.2.2. </span>LLM-Based Safety Filtering</a><a class="headerlink" href="#llm-based-safety-filtering" title="Permalink to this heading">¶</a></h4>
 <p>Alternatively, an LLM-based component can be used as a content filter. Here, we observe three types os approaches:
 1. Moderation API,
 2. Fine-Tuned Open Source Models, and
 3. Custom Moderation.</p>
 <p>Model providers such as OpenAI, and Mistral offer moderation APIs that can be used to filter content. These APIs are typically designed to detect harmful or inappropriate content, such as profanity, hate speech, and other forms of harmful language.</p>
-<p>Mistral’s Moderation API <span id="id58">[<a class="reference internal" href="#id241" title="Mistral AI. Mistral moderation: a technical report. 2024. URL: https://mistral.ai/news/mistral-moderation/.">Mistral AI, 2024</a>]</span>, released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.</p>
+<p>Mistral’s Moderation API <span id="id58">[<a class="reference internal" href="#id248" title="Mistral AI. Mistral moderation: a technical report. 2024. URL: https://mistral.ai/news/mistral-moderation/.">Mistral AI, 2024</a>]</span>, released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Mistral&#39;s Moderation API - Raw Text</span>
 <span class="kn">import</span> <span class="nn">os</span>
 <span class="kn">from</span> <span class="nn">mistralai</span> <span class="kn">import</span> <span class="n">Mistral</span>
@@ -1317,7 +1326,7 @@ <h4><a class="toc-backref" href="#id287" role="doc-backlink"><span class="sectio
 <span class="nb">print</span><span class="p">(</span><span class="n">response</span><span class="p">)</span>
 </pre></div>
 </div>
-<p>OpenAI’s Moderation API <span id="id59">[<a class="reference internal" href="#id242" title="OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: https://platform.openai.com/docs/guides/moderation.">OpenAI, 2024</a>]</span> is free of use and can be accessed via the base model name <code class="docutils literal notranslate"><span class="pre">omni-moderation</span></code>. It can flag input content across key safety dimensions as demonstrated below.</p>
+<p>OpenAI’s Moderation API <span id="id59">[<a class="reference internal" href="#id249" title="OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: https://platform.openai.com/docs/guides/moderation.">OpenAI, 2024</a>]</span> is free of use and can be accessed via the base model name <code class="docutils literal notranslate"><span class="pre">omni-moderation</span></code>. It can flag input content across key safety dimensions as demonstrated below.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">dotenv</span> <span class="kn">import</span> <span class="n">load_dotenv</span>
@@ -1466,29 +1475,29 @@ <h4><a class="toc-backref" href="#id287" role="doc-backlink"><span class="sectio
 <li><p>S12: Sexual Content.</p></li>
 <li><p>S13: Elections.</p></li>
 </ul>
-<p><strong>IBM Granite Guardian</strong> is a new competitor to Llama Guard family. It is collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas <span id="id60">[<a class="reference internal" href="#id246" title="IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas.">IBM, 2024</a>]</span>. The collection comprises two classes of models:</p>
+<p><strong>IBM Granite Guardian</strong> is a new competitor to Llama Guard family. It is collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas <span id="id60">[<a class="reference internal" href="#id253" title="IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas.">IBM, 2024</a>]</span>. The collection comprises two classes of models:</p>
 <ol class="arabic simple">
 <li><p>Granite-Guardian-3.0-2B and Granite-Guardian-3.0-8B for detecting different forms of harmful content</p></li>
 <li><p>Granite Guardian HAP 38M and Granite Guardian HAP 125M for detecting toxic content.</p></li>
 </ol>
-<p>In a paper from December/2024 <span id="id61">[<a class="reference internal" href="#id247" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>, the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In <a class="reference internal" href="#granite"><span class="std std-numref">Fig. 5.14</span></a> we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.</p>
+<p>In a paper from December/2024 <span id="id61">[<a class="reference internal" href="#id254" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>, the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In <a class="reference internal" href="#granite"><span class="std std-numref">Fig. 6.14</span></a> we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.</p>
 <figure class="align-center" id="granite">
 <a class="reference internal image-reference" href="../_images/granite.png"><img alt="IBM Granite Guardian performance for the &quot;Harm&quot; risk dimension." src="../_images/granite.png" style="width: 65%;" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.14 </span><span class="caption-text">IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension <span id="id62">[<a class="reference internal" href="#id247" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#granite" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.14 </span><span class="caption-text">IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension <span id="id62">[<a class="reference internal" href="#id254" title="Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.">Padhi <em>et al.</em>, 2024</a>]</span>.</span><a class="headerlink" href="#granite" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>The industry is increasingly focusing on the fine-tuning of pre-trained base models targeting a specific dimension of requirements and standards, here Safety being a critical one. This trend encompasses the release of open-source, fine-tuned safety models that can act as protective guardrails for LLM applications, as exemplified by LLaMa-Guard and IBM Granite Guardian. Additionally, there is a notable rise in models fine-tuned through techniques such as Reinforcement Learning from Human Feedback (RLHF), utilizing human preference datasets that incorporate safety considerations. These specialized models can function as safety filters as discussed but also as main models that alone could accomplished their original intended task but safely. We will cover this specific topic of preference-based alignment in the next Chapter <a class="reference internal" href="alignment.html#alignment"><span class="std std-ref">Preference-Based Alignment</span></a>, where we will explore the process of aligning language models with human preferences ultimately leading to the development of an open source fine-tuned model that complies with user provided policy-based requirements.</p>
 </section>
 <section id="custom-moderation">
-<h4><a class="toc-backref" href="#id288" role="doc-backlink"><span class="section-number">5.6.2.3. </span>Custom Moderation</a><a class="headerlink" href="#custom-moderation" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id295" role="doc-backlink"><span class="section-number">6.6.2.3. </span>Custom Moderation</a><a class="headerlink" href="#custom-moderation" title="Permalink to this heading">¶</a></h4>
 <p>We have covered filtering-based approaches using moderation APIs and fine-tuned open source models. Rather than relying on external filters, LLMs themselves can be guided to avoid harmful content through careful prompt engineering.</p>
 <p>Custom moderation offers a tailored content filtering approach, ensuring adherence to your own specific standards. As we have seen, each filtering-based approach we have discussed, while each having its own strengths, they all implement or enable safety according to a pre-defined dimension of requirements and standards. Custom moderation, on the other hand, provides greater control compared to general moderation APIs or fine-tuned open source models though it requires more setup and maintenance.</p>
-<p>A common approach, when building a custom LLM-based filter, is to build an LLM-as-a-Judge filter as illustrated in <a class="reference internal" href="#judge-safety"><span class="std std-numref">Fig. 5.15</span></a>. It a simple idea to use an LLM to judge the output of another LLM as well as user prompt in the context of your LLM-based application (please see <a class="reference external" href="https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html#model-based-evaluation">Section “Model Based Evaluation” - Chapter Evals</a> for design and best practices of LLM-based evals.)</p>
+<p>A common approach, when building a custom LLM-based filter, is to build an LLM-as-a-Judge filter as illustrated in <a class="reference internal" href="#judge-safety"><span class="std std-numref">Fig. 6.15</span></a>. It a simple idea to use an LLM to judge the output of another LLM as well as user prompt in the context of your LLM-based application (please see <a class="reference external" href="https://www.souzatharsis.com/tamingLLMs/notebooks/evals.html#model-based-evaluation">Section “Model Based Evaluation” - Chapter Evals</a> for design and best practices of LLM-based evals.)</p>
 <figure class="align-center" id="judge-safety">
 <a class="reference internal image-reference" href="../_images/judge.svg"><img alt="LLM-as-a-judge as safety filter." src="../_images/judge.svg" width="95%" /></a>
 <figcaption>
-<p><span class="caption-number">Fig. 5.15 </span><span class="caption-text">LLM-as-a-judge as safety filter.</span><a class="headerlink" href="#judge-safety" title="Permalink to this image">¶</a></p>
+<p><span class="caption-number">Fig. 6.15 </span><span class="caption-text">LLM-as-a-judge as safety filter.</span><a class="headerlink" href="#judge-safety" title="Permalink to this image">¶</a></p>
 </figcaption>
 </figure>
 <p>Below we display an example of a prompt engineered for an LLM-as-a-judge to be used as a safety filter.</p>
@@ -1558,17 +1567,17 @@ <h4><a class="toc-backref" href="#id288" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="case-study-implementing-a-safety-filter">
-<h2><a class="toc-backref" href="#id289" role="doc-backlink"><span class="section-number">5.7. </span>Case Study: Implementing a Safety Filter</a><a class="headerlink" href="#case-study-implementing-a-safety-filter" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id296" role="doc-backlink"><span class="section-number">6.7. </span>Case Study: Implementing a Safety Filter</a><a class="headerlink" href="#case-study-implementing-a-safety-filter" title="Permalink to this heading">¶</a></h2>
 <p>We will implement a basic safety filter for a K-12 application that will be used to filter content in a chat interface. The application will be designed to be used in a classroom setting where students and teachers can interact with the model to ask questions and receive answers. The safety filter will be designed to filter out harmful content such as profanity, hate speech, and other inappropriate content.</p>
 <p>In this stylized case study, we will limit our scope to the implementation of a safety filter for user prompts. We will not cover the implementation of the application itself or filtering the model’s output but rather focus on the user prompt safety filter. In real-world applications, an input policy would be paramount to better define what safety means before we identify associated risks and consecutive implementation decisions. Here, we will discuss the implementation of safety through the design of the evals dataset (you will later see, skipping policy will lead to trouble later in the case study!)</p>
 <section id="evals-dataset">
-<h3><a class="toc-backref" href="#id290" role="doc-backlink"><span class="section-number">5.7.1. </span>Evals Dataset</a><a class="headerlink" href="#evals-dataset" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id297" role="doc-backlink"><span class="section-number">6.7.1. </span>Evals Dataset</a><a class="headerlink" href="#evals-dataset" title="Permalink to this heading">¶</a></h3>
 <p>Creating a balanced evaluation dataset is crucial for developing robust safety measures. The dataset should be a well balanced set of “good” and “bad” samples to avoid biasing the model’s behavior in either direction.</p>
 <p>For this evaluation, we will create a dataset with <code class="docutils literal notranslate"><span class="pre">NUM_SAMPLES</span></code> examples, evenly split between good and bad samples (<code class="docutils literal notranslate"><span class="pre">GOOD_SAMPLES</span></code> and <code class="docutils literal notranslate"><span class="pre">BAD_SAMPLES</span></code>, respectively).</p>
 <p>The good samples will be sourced from the UltraFeedback Binarized dataset <span id="id63">[<a class="reference internal" href="#id169" title="Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.">H4, 2024z</a>]</span>, which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.</p>
 <p>The bad samples will come from two sources:</p>
 <ol class="arabic simple">
-<li><p>Profanity keywords from the Surge AI Profanity Dataset <span id="id64">[<a class="reference internal" href="#id238" title="Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: https://github.com/surge-ai/profanity.">Surge AI, 2024</a>]</span> - This provides examples of explicit inappropriate content.</p></li>
+<li><p>Profanity keywords from the Surge AI Profanity Dataset <span id="id64">[<a class="reference internal" href="#id245" title="Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: https://github.com/surge-ai/profanity.">Surge AI, 2024</a>]</span> - This provides examples of explicit inappropriate content.</p></li>
 <li><p>Prompts sourced from Salad-Bench - These represent more subtle forms of harmful content like scams, harassment, or dangerous instructions, hence not necessarily mentioning an inappropriate keywords but rather a potentially harmful instruction.</p></li>
 </ol>
 <p>This balanced approach helps ensure our safety measures can effectively identify explicit and nuanced harmful content while minimizing false positives across diverse real-world scenarios.</p>
@@ -1581,7 +1590,7 @@ <h3><a class="toc-backref" href="#id290" role="doc-backlink"><span class="sectio
 </div>
 </div>
 <section id="bad-samples">
-<h4><a class="toc-backref" href="#id291" role="doc-backlink"><span class="section-number">5.7.1.1. </span>Bad Samples</a><a class="headerlink" href="#bad-samples" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id298" role="doc-backlink"><span class="section-number">6.7.1.1. </span>Bad Samples</a><a class="headerlink" href="#bad-samples" title="Permalink to this heading">¶</a></h4>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_profanity_samples</span><span class="p">(</span><span class="n">num_samples</span><span class="p">,</span> <span class="n">show_stats</span><span class="o">=</span><span class="kc">True</span><span class="p">):</span>
@@ -1723,7 +1732,7 @@ <h4><a class="toc-backref" href="#id291" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="good-samples">
-<h4><a class="toc-backref" href="#id292" role="doc-backlink"><span class="section-number">5.7.1.2. </span>Good Samples</a><a class="headerlink" href="#good-samples" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id299" role="doc-backlink"><span class="section-number">6.7.1.2. </span>Good Samples</a><a class="headerlink" href="#good-samples" title="Permalink to this heading">¶</a></h4>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">get_good_samples</span><span class="p">(</span><span class="n">num_samples</span><span class="p">):</span>
@@ -1904,7 +1913,7 @@ <h4><a class="toc-backref" href="#id292" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="safety-filters">
-<h3><a class="toc-backref" href="#id293" role="doc-backlink"><span class="section-number">5.7.2. </span>Safety Filters</a><a class="headerlink" href="#safety-filters" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id300" role="doc-backlink"><span class="section-number">6.7.2. </span>Safety Filters</a><a class="headerlink" href="#safety-filters" title="Permalink to this heading">¶</a></h3>
 <p>We will implement four safety filters, one for each of the following:</p>
 <ol class="arabic simple">
 <li><p>LLM-Guard</p></li>
@@ -1970,7 +1979,7 @@ <h3><a class="toc-backref" href="#id293" role="doc-backlink"><span class="sectio
 </div>
 </div>
 <section id="llm-guard">
-<h4><a class="toc-backref" href="#id294" role="doc-backlink"><span class="section-number">5.7.2.1. </span>LLM-Guard</a><a class="headerlink" href="#llm-guard" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id301" role="doc-backlink"><span class="section-number">6.7.2.1. </span>LLM-Guard</a><a class="headerlink" href="#llm-guard" title="Permalink to this heading">¶</a></h4>
 <p>Next, we implement a concrete validator using LLM Guard. The <code class="docutils literal notranslate"><span class="pre">LLMGuardValidator</span></code> class combines two key scanners:</p>
 <ul class="simple">
 <li><p>BanTopics: Flags content containing banned topics</p></li>
@@ -2063,7 +2072,7 @@ <h4><a class="toc-backref" href="#id294" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="mistral-moderation-api">
-<h4><a class="toc-backref" href="#id295" role="doc-backlink"><span class="section-number">5.7.2.2. </span>Mistral Moderation API</a><a class="headerlink" href="#mistral-moderation-api" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id302" role="doc-backlink"><span class="section-number">6.7.2.2. </span>Mistral Moderation API</a><a class="headerlink" href="#mistral-moderation-api" title="Permalink to this heading">¶</a></h4>
 <p>You will need a Mistral API key to use the Mistral Moderation API. You can get one by signing up for a Mistral account and creating an API key, which we will assume is stored in a local <code class="docutils literal notranslate"><span class="pre">.env</span></code> file under the <code class="docutils literal notranslate"><span class="pre">MISTRAL_API_KEY</span></code> variable.</p>
 <p>The <code class="docutils literal notranslate"><span class="pre">MistralValidator</span></code> class implements a safety validator using Mistral’s moderation API. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on Mistral moderation categories. Example:</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">{</span><span class="s1">&#39;sexual&#39;</span><span class="p">:</span> <span class="kc">False</span><span class="p">,</span>
@@ -2143,7 +2152,7 @@ <h4><a class="toc-backref" href="#id295" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="openai-moderation-api">
-<h4><a class="toc-backref" href="#id296" role="doc-backlink"><span class="section-number">5.7.2.3. </span>OpenAI Moderation API</a><a class="headerlink" href="#openai-moderation-api" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id303" role="doc-backlink"><span class="section-number">6.7.2.3. </span>OpenAI Moderation API</a><a class="headerlink" href="#openai-moderation-api" title="Permalink to this heading">¶</a></h4>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
 <div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">openai</span> <span class="kn">import</span> <span class="n">OpenAI</span>
@@ -2207,7 +2216,7 @@ <h4><a class="toc-backref" href="#id296" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="custom-judge-validator">
-<h4><a class="toc-backref" href="#id297" role="doc-backlink"><span class="section-number">5.7.2.4. </span>Custom Judge Validator</a><a class="headerlink" href="#custom-judge-validator" title="Permalink to this heading">¶</a></h4>
+<h4><a class="toc-backref" href="#id304" role="doc-backlink"><span class="section-number">6.7.2.4. </span>Custom Judge Validator</a><a class="headerlink" href="#custom-judge-validator" title="Permalink to this heading">¶</a></h4>
 <p>The <code class="docutils literal notranslate"><span class="pre">LLMJudgeValidator</span></code> class implements a safety validator using GPT-4o-mini. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on an input safety prompt.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -2292,7 +2301,7 @@ <h4><a class="toc-backref" href="#id297" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="benchmarking">
-<h3><a class="toc-backref" href="#id298" role="doc-backlink"><span class="section-number">5.7.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id305" role="doc-backlink"><span class="section-number">6.7.3. </span>Benchmarking</a><a class="headerlink" href="#benchmarking" title="Permalink to this heading">¶</a></h3>
 <p>We are ready to run our four safety filters against our dataset. We will store validation results as well as elapsed time for each validator.</p>
 <div class="cell docutils container">
 <div class="cell_input docutils container">
@@ -2781,7 +2790,7 @@ <h3><a class="toc-backref" href="#id298" role="doc-backlink"><span class="sectio
 <p>Having said that, I want to be clear that further investigation is needed before one could claim that the dataset is unsafe. Here, we only show anecdotal evidence that the dataset contains unsafe content for our particular case study. We do not claim that the dataset is unsafe per se. Instead, a superior experiment would have constructed a proper dataset that more closely matches what safe conversations look like in the application domain we are studying.</p>
 </section>
 <section id="takeaways">
-<h3><a class="toc-backref" href="#id299" role="doc-backlink"><span class="section-number">5.7.4. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id306" role="doc-backlink"><span class="section-number">6.7.4. </span>Takeaways</a><a class="headerlink" href="#takeaways" title="Permalink to this heading">¶</a></h3>
 <ul class="simple">
 <li><p>Safety is a complex problem and there is no one-size-fits-all solution.</p></li>
 <li><p>Starting with a well-aligned policy is key to developing a robust data and evaluation framework.</p></li>
@@ -2791,14 +2800,14 @@ <h3><a class="toc-backref" href="#id299" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id300" role="doc-backlink"><span class="section-number">5.8. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id307" role="doc-backlink"><span class="section-number">6.8. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>The rapid advancement of large language models has created an unsettling paradox: the same technologies that promise to revolutionize human-AI interaction also harbor significant risks that could undermine the very societies they aim to benefit. Our examination of various safety measures - from constitutional AI to red teaming - reveals that each approach has specific strengths and limitations when implemented in practice. However, instead of waiting for governments, organizations, and the public to catch up, we need to take action now.</p>
 <p>The case study on safety filters demonstrated the complexity of implementing even basic safety measures in real-world applications. What appears safe in one context may be inappropriate in another, and our current methods of safety evaluation often struggle with these nuances. The challenge of developing robust safety measures is further complicated by the potential for feedback loops in the training process - when models are fine-tuned on datasets that may contain hidden biases or problematic content.</p>
 <p>The path forward requires combining technical innovation with practical domain-specific wisdom. Safety in GenAI isn’t just a technical problem to be solved - it’s a mirror reflecting our own values, biases, and aspirations back at us. The growing focus on safety across the AI community, from open-source initiatives to corporate governance frameworks, provides a foundation for developing more robust safety measures. However, technologists working in isolation cannot solve these challenges - and may even perpetuate them unknowingly. Instead, domain experts across different verticals must come together to collaboratively define what safety means in the context of their specific users and broader society in work in collaboration with the AI community.</p>
 <p>Only through this cross-disciplinary collaboration can we move beyond the current uncertainty into a future where safety and innovation reinforce rather than oppose each other. This requires building bridges between technical experts, ethicists, policymakers, and the communities they serve to develop holistic frameworks that protect while enabling progress.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id301" role="doc-backlink"><span class="section-number">5.9. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id308" role="doc-backlink"><span class="section-number">6.9. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -2812,51 +2821,47 @@ <h2><a class="toc-backref" href="#id301" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id302" role="doc-backlink"><span class="section-number">5.10. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id309" role="doc-backlink"><span class="section-number">6.10. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id65">
-<div class="citation" id="id192" role="doc-biblioentry">
-<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id30">AI24</a><span class="fn-bracket">]</span></span>
-<p>Meta AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: <a class="reference external" href="https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/">https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/</a>.</p>
-</div>
-<div class="citation" id="id217" role="doc-biblioentry">
+<div class="citation" id="id224" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>ASA24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id33">1</a>,<a role="doc-backlink" href="#id34">2</a>)</span>
 <p>Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2409.08751">https://arxiv.org/abs/2409.08751</a>, <a class="reference external" href="https://arxiv.org/abs/2409.08751">arXiv:2409.08751</a>.</p>
 </div>
-<div class="citation" id="id227" role="doc-biblioentry">
+<div class="citation" id="id234" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>ABC+23<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id38">1</a>,<a role="doc-backlink" href="#id39">2</a>)</span>
 <p>Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: <a class="reference external" href="https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback">https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback</a>.</p>
 </div>
-<div class="citation" id="id211" role="doc-biblioentry">
+<div class="citation" id="id218" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id4">BHY+24</a><span class="fn-bracket">]</span></span>
 <p>Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. <em>Science</em>, 384(6698):842–845, 2024. URL: <a class="reference external" href="https://www.science.org/doi/abs/10.1126/science.adn0117">https://www.science.org/doi/abs/10.1126/science.adn0117</a>, <a class="reference external" href="https://arxiv.org/abs/https://www.science.org/doi/pdf/10.1126/science.adn0117">arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117</a>, <a class="reference external" href="https://doi.org/10.1126/science.adn0117">doi:10.1126/science.adn0117</a>.</p>
 </div>
-<div class="citation" id="id210" role="doc-biblioentry">
+<div class="citation" id="id217" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>BBC+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id8">1</a>,<a role="doc-backlink" href="#id16">2</a>)</span>
 <p>Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2410.23308">https://arxiv.org/abs/2410.23308</a>, <a class="reference external" href="https://arxiv.org/abs/2410.23308">arXiv:2410.23308</a>.</p>
 </div>
-<div class="citation" id="id207" role="doc-biblioentry">
+<div class="citation" id="id214" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>BMC+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id7">1</a>,<a role="doc-backlink" href="#id15">2</a>)</span>
 <p>Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2408.02946">https://arxiv.org/abs/2408.02946</a>, <a class="reference external" href="https://arxiv.org/abs/2408.02946">arXiv:2408.02946</a>.</p>
 </div>
-<div class="citation" id="id226" role="doc-biblioentry">
+<div class="citation" id="id233" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id40">CMM+24</a><span class="fn-bracket">]</span></span>
 <p>Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2407.15248">https://arxiv.org/abs/2407.15248</a>, <a class="reference external" href="https://arxiv.org/abs/2407.15248">arXiv:2407.15248</a>.</p>
 </div>
-<div class="citation" id="id213" role="doc-biblioentry">
+<div class="citation" id="id220" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>Edg24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id5">1</a>,<a role="doc-backlink" href="#id10">2</a>)</span>
 <p>Alec Edgington. How to exploit large language models for good or bad. <em>SIAM News</em>, 2024. URL: <a class="reference external" href="https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/">https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/</a>.</p>
 </div>
-<div class="citation" id="id215" role="doc-biblioentry">
+<div class="citation" id="id222" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>Exa24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id17">1</a>,<a role="doc-backlink" href="#id19">2</a>)</span>
 <p>Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: <a class="reference external" href="https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/">https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/</a>.</p>
 </div>
-<div class="citation" id="id208" role="doc-biblioentry">
+<div class="citation" id="id215" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id13">GRB+24</a><span class="fn-bracket">]</span></span>
 <p>Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2309.00770">https://arxiv.org/abs/2309.00770</a>, <a class="reference external" href="https://arxiv.org/abs/2309.00770">arXiv:2309.00770</a>.</p>
 </div>
@@ -2864,16 +2869,16 @@ <h2><a class="toc-backref" href="#id302" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id63">H44z</a><span class="fn-bracket">]</span></span>
 <p>Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: <a class="reference external" href="https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized">https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized</a>.</p>
 </div>
-<div class="citation" id="id205" role="doc-biblioentry">
+<div class="citation" id="id212" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id1">HGP+22</a><span class="fn-bracket">]</span></span>
 <p>Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, <em>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</em>, 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: <a class="reference external" href="https://aclanthology.org/2022.acl-long.234">https://aclanthology.org/2022.acl-long.234</a>, <a class="reference external" href="https://doi.org/10.18653/v1/2022.acl-long.234">doi:10.18653/v1/2022.acl-long.234</a>.</p>
 </div>
-<div class="citation" id="id206" role="doc-biblioentry">
+<div class="citation" id="id213" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>HYM+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id6">1</a>,<a role="doc-backlink" href="#id12">2</a>)</span>
 <p>Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. <em>ACM Transactions on Information Systems</em>, November 2024. URL: <a class="reference external" href="http://dx.doi.org/10.1145/3703155">http://dx.doi.org/10.1145/3703155</a>, <a class="reference external" href="https://doi.org/10.1145/3703155">doi:10.1145/3703155</a>.</p>
 </div>
-<div class="citation" id="id228" role="doc-biblioentry">
+<div class="citation" id="id235" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>LDW+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id41">1</a>,<a role="doc-backlink" href="#id42">2</a>)</span>
 <p>Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2402.05044">https://arxiv.org/abs/2402.05044</a>, <a class="reference external" href="https://arxiv.org/abs/2402.05044">arXiv:2402.05044</a>.</p>
@@ -2883,30 +2888,34 @@ <h2><a class="toc-backref" href="#id302" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id47">1</a>,<a role="doc-backlink" href="#id48">2</a>)</span>
 <p>Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2109.07958">https://arxiv.org/abs/2109.07958</a>, <a class="reference external" href="https://arxiv.org/abs/2109.07958">arXiv:2109.07958</a>.</p>
 </div>
-<div class="citation" id="id234" role="doc-biblioentry">
+<div class="citation" id="id241" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>MPY+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id49">1</a>,<a role="doc-backlink" href="#id52">2</a>)</span>
 <p>Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2402.04249">https://arxiv.org/abs/2402.04249</a>, <a class="reference external" href="https://arxiv.org/abs/2402.04249">arXiv:2402.04249</a>.</p>
 </div>
+<div class="citation" id="id199" role="doc-biblioentry">
+<span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id30">MA24</a><span class="fn-bracket">]</span></span>
+<p>Meta-AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: <a class="reference external" href="https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/">https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/</a>.</p>
+</div>
 <div class="citation" id="id167" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id31">MLC24</a><span class="fn-bracket">]</span></span>
 <p>MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: <a class="reference external" href="https://ailuminate.mlcommons.org/benchmarks/">https://ailuminate.mlcommons.org/benchmarks/</a>.</p>
 </div>
-<div class="citation" id="id204" role="doc-biblioentry">
+<div class="citation" id="id211" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id1">OAA+24</a><span class="fn-bracket">]</span></span>
 <p>OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2303.08774">https://arxiv.org/abs/2303.08774</a>, <a class="reference external" href="https://arxiv.org/abs/2303.08774">arXiv:2303.08774</a>.</p>
 </div>
-<div class="citation" id="id247" role="doc-biblioentry">
+<div class="citation" id="id254" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>PNC+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id61">1</a>,<a role="doc-backlink" href="#id62">2</a>)</span>
 <p>Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2412.07724">https://arxiv.org/abs/2412.07724</a>, <a class="reference external" href="https://arxiv.org/abs/2412.07724">arXiv:2412.07724</a>.</p>
 </div>
-<div class="citation" id="id225" role="doc-biblioentry">
+<div class="citation" id="id232" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>PHS+22<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id36">1</a>,<a role="doc-backlink" href="#id37">2</a>)</span>
 <p>Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: <a class="reference external" href="https://arxiv.org/abs/2202.03286">https://arxiv.org/abs/2202.03286</a>, <a class="reference external" href="https://arxiv.org/abs/2202.03286">arXiv:2202.03286</a>.</p>
 </div>
-<div class="citation" id="id214" role="doc-biblioentry">
+<div class="citation" id="id221" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id9">SZW+24</a><span class="fn-bracket">]</span></span>
 <p>Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2406.12670">https://arxiv.org/abs/2406.12670</a>, <a class="reference external" href="https://arxiv.org/abs/2406.12670">arXiv:2406.12670</a>.</p>
 </div>
@@ -2915,102 +2924,102 @@ <h2><a class="toc-backref" href="#id302" role="doc-backlink"><span class="sectio
 <span class="backrefs">(<a role="doc-backlink" href="#id29">1</a>,<a role="doc-backlink" href="#id32">2</a>)</span>
 <p>Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2404.12241">https://arxiv.org/abs/2404.12241</a>, <a class="reference external" href="https://arxiv.org/abs/2404.12241">arXiv:2404.12241</a>.</p>
 </div>
-<div class="citation" id="id203" role="doc-biblioentry">
+<div class="citation" id="id210" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>VSK+24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id2">1</a>,<a role="doc-backlink" href="#id3">2</a>)</span>
 <p>Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2311.08370">https://arxiv.org/abs/2311.08370</a>, <a class="reference external" href="https://arxiv.org/abs/2311.08370">arXiv:2311.08370</a>.</p>
 </div>
-<div class="citation" id="id219" role="doc-biblioentry">
+<div class="citation" id="id226" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id35">WMR24</a><span class="fn-bracket">]</span></span>
 <p>Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? <em>Royal Society Open Science</em>, 11(8):240197, 2024. URL: <a class="reference external" href="https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197">https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197</a>, <a class="reference external" href="https://arxiv.org/abs/https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197">arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197</a>, <a class="reference external" href="https://doi.org/10.1098/rsos.240197">doi:10.1098/rsos.240197</a>.</p>
 </div>
-<div class="citation" id="id231" role="doc-biblioentry">
+<div class="citation" id="id238" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id46">YLX24</a><span class="fn-bracket">]</span></span>
 <p>Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: <a class="reference external" href="https://paperswithcode.com/dataset/gptfuzzer">https://paperswithcode.com/dataset/gptfuzzer</a>.</p>
 </div>
-<div class="citation" id="id209" role="doc-biblioentry">
+<div class="citation" id="id216" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id14">ZYY+24</a><span class="fn-bracket">]</span></span>
 <p>Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. &quot;ghost of the past&quot;: identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: <a class="reference external" href="https://arxiv.org/abs/2410.14931">https://arxiv.org/abs/2410.14931</a>, <a class="reference external" href="https://arxiv.org/abs/2410.14931">arXiv:2410.14931</a>.</p>
 </div>
-<div class="citation" id="id212" role="doc-biblioentry">
+<div class="citation" id="id219" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id11">Zho24</a><span class="fn-bracket">]</span></span>
 <p>Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: <a class="reference external" href="https://huggingface.co/spaces/qinghua-zhou/stealth-edits">https://huggingface.co/spaces/qinghua-zhou/stealth-edits</a>.</p>
 </div>
-<div class="citation" id="id245" role="doc-biblioentry">
+<div class="citation" id="id252" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id56">AmazonWServices24</a><span class="fn-bracket">]</span></span>
 <p>Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: <a class="reference external" href="https://aws.amazon.com/comprehend/">https://aws.amazon.com/comprehend/</a>.</p>
 </div>
-<div class="citation" id="id223" role="doc-biblioentry">
+<div class="citation" id="id230" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id26">Anthropic24</a><span class="fn-bracket">]</span></span>
 <p>Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: <a class="reference external" href="https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf">https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf</a>.</p>
 </div>
-<div class="citation" id="id235" role="doc-biblioentry">
+<div class="citation" id="id242" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id50">CenterfASafety24a</a><span class="fn-bracket">]</span></span>
 <p>Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: <a class="reference external" href="https://github.com/centerforaisafety/HarmBench">https://github.com/centerforaisafety/HarmBench</a>.</p>
 </div>
-<div class="citation" id="id236" role="doc-biblioentry">
+<div class="citation" id="id243" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id51">CenterfASafety24b</a><span class="fn-bracket">]</span></span>
 <p>Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: <a class="reference external" href="https://www.harmbench.org/results">https://www.harmbench.org/results</a>.</p>
 </div>
-<div class="citation" id="id224" role="doc-biblioentry">
+<div class="citation" id="id231" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>DeepMind24<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id27">1</a>,<a role="doc-backlink" href="#id28">2</a>)</span>
 <p>DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: <a class="reference external" href="https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf">https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf</a>.</p>
 </div>
-<div class="citation" id="id216" role="doc-biblioentry">
+<div class="citation" id="id223" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id19">EuropeanMAgency24</a><span class="fn-bracket">]</span></span>
 <p>European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: <a class="reference external" href="https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf">https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf</a>.</p>
 </div>
-<div class="citation" id="id201" role="doc-biblioentry">
+<div class="citation" id="id208" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id18">FinancialIRAuthority24</a><span class="fn-bracket">]</span></span>
 <p>Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: <a class="reference external" href="https://www.finra.org/rules-guidance/notices/24-09">https://www.finra.org/rules-guidance/notices/24-09</a>.</p>
 </div>
-<div class="citation" id="id246" role="doc-biblioentry">
+<div class="citation" id="id253" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id60">IBM24</a><span class="fn-bracket">]</span></span>
 <p>IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: <a class="reference external" href="https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas">https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas</a>.</p>
 </div>
-<div class="citation" id="id220" role="doc-biblioentry">
+<div class="citation" id="id227" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id22">LibraryoCongress23</a><span class="fn-bracket">]</span></span>
 <p>Library of Congress. China: generative ai measures finalized. July 2023. URL: <a class="reference external" href="https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/">https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/</a>.</p>
 </div>
-<div class="citation" id="id241" role="doc-biblioentry">
+<div class="citation" id="id248" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id58">MistralAI24</a><span class="fn-bracket">]</span></span>
 <p>Mistral AI. Mistral moderation: a technical report. 2024. URL: <a class="reference external" href="https://mistral.ai/news/mistral-moderation/">https://mistral.ai/news/mistral-moderation/</a>.</p>
 </div>
-<div class="citation" id="id233" role="doc-biblioentry">
+<div class="citation" id="id240" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id54">MLSTeam24</a><span class="fn-bracket">]</span></span>
 <p>ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: <a class="reference external" href="https://www.mlsafety.org/safebench">https://www.mlsafety.org/safebench</a>.</p>
 </div>
-<div class="citation" id="id221" role="doc-biblioentry">
+<div class="citation" id="id228" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id23">NationalIoSaTechnology24</a><span class="fn-bracket">]</span></span>
 <p>National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: <a class="reference external" href="https://www.nist.gov/itl/ai-risk-management-framework">https://www.nist.gov/itl/ai-risk-management-framework</a>.</p>
 </div>
-<div class="citation" id="id244" role="doc-biblioentry">
+<div class="citation" id="id251" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id57">NVIDIA24</a><span class="fn-bracket">]</span></span>
 <p>NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: <a class="reference external" href="https://github.com/NVIDIA/NeMo-Guardrails">https://github.com/NVIDIA/NeMo-Guardrails</a>.</p>
 </div>
-<div class="citation" id="id242" role="doc-biblioentry">
+<div class="citation" id="id249" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id59">OpenAI24a</a><span class="fn-bracket">]</span></span>
 <p>OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: <a class="reference external" href="https://platform.openai.com/docs/guides/moderation">https://platform.openai.com/docs/guides/moderation</a>.</p>
 </div>
-<div class="citation" id="id222" role="doc-biblioentry">
+<div class="citation" id="id229" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span>OpenAI24b<span class="fn-bracket">]</span></span>
 <span class="backrefs">(<a role="doc-backlink" href="#id24">1</a>,<a role="doc-backlink" href="#id25">2</a>)</span>
 <p>OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: <a class="reference external" href="https://cdn.openai.com/openai-preparedness-framework-beta.pdf">https://cdn.openai.com/openai-preparedness-framework-beta.pdf</a>.</p>
 </div>
-<div class="citation" id="id230" role="doc-biblioentry">
+<div class="citation" id="id237" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id44">OpenSafetyLab24a</a><span class="fn-bracket">]</span></span>
 <p>OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: <a class="reference external" href="https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard">https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard</a>.</p>
 </div>
-<div class="citation" id="id229" role="doc-biblioentry">
+<div class="citation" id="id236" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id45">OpenSafetyLab24b</a><span class="fn-bracket">]</span></span>
 <p>OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: <a class="reference external" href="https://huggingface.co/datasets/OpenSafetyLab/Salad-Data">https://huggingface.co/datasets/OpenSafetyLab/Salad-Data</a>.</p>
 </div>
-<div class="citation" id="id243" role="doc-biblioentry">
+<div class="citation" id="id250" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id55">ProtectAI24</a><span class="fn-bracket">]</span></span>
 <p>ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: <a class="reference external" href="https://github.com/protectai/llm-guard">https://github.com/protectai/llm-guard</a>.</p>
 </div>
-<div class="citation" id="id238" role="doc-biblioentry">
+<div class="citation" id="id245" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id64">SurgeAI24</a><span class="fn-bracket">]</span></span>
 <p>Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: <a class="reference external" href="https://github.com/surge-ai/profanity">https://github.com/surge-ai/profanity</a>.</p>
 </div>
@@ -3018,7 +3027,7 @@ <h2><a class="toc-backref" href="#id302" role="doc-backlink"><span class="sectio
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id21">UKGovernment24</a><span class="fn-bracket">]</span></span>
 <p>UK Government. Ai regulation: a pro-innovation approach. White Paper, Department for Science, Innovation and Technology, 2024. URL: <a class="reference external" href="https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper">https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper</a>.</p>
 </div>
-<div class="citation" id="id218" role="doc-biblioentry">
+<div class="citation" id="id225" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id20">UNICEF24</a><span class="fn-bracket">]</span></span>
 <p>UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: <a class="reference external" href="https://www.unicef.org/innocenti/reports/policy-guidance-ai-children">https://www.unicef.org/innocenti/reports/policy-guidance-ai-children</a>.</p>
 </div>
@@ -3051,12 +3060,12 @@ <h2><a class="toc-backref" href="#id302" role="doc-backlink"><span class="sectio
           <div class="page-nav">
             <div class="inner"><ul class="page-nav">
   <li class="prev">
-    <a href="structured_output.html"
-       title="previous chapter">← <span class="section-number">4. </span>Structured Output</a>
+    <a href="input.html"
+       title="previous chapter">← <span class="section-number">5. </span>Managing Input Data</a>
   </li>
   <li class="next">
     <a href="alignment.html"
-       title="next chapter"><span class="section-number">6. </span>Preference-Based Alignment →</a>
+       title="next chapter"><span class="section-number">7. </span>Preference-Based Alignment →</a>
   </li>
 </ul><div class="footer" role="contentinfo">
       &#169; Copyright Tharsis T. P. Souza, 2024.
diff --git a/tamingllms/_build/html/notebooks/structured_output.html b/tamingllms/_build/html/notebooks/structured_output.html
index 3b24713..9cb9f0a 100644
--- a/tamingllms/_build/html/notebooks/structured_output.html
+++ b/tamingllms/_build/html/notebooks/structured_output.html
@@ -39,7 +39,7 @@
     
   <link rel="index" title="Index" href="../genindex.html" />
   <link rel="search" title="Search" href="../search.html" />
-  <link rel="next" title="5. Safety" href="safety.html" />
+  <link rel="next" title="5. Managing Input Data" href="input.html" />
   <link rel="prev" title="3. The Evals Gap" href="evals.html" /> 
   </head>
 
@@ -181,6 +181,15 @@
           </li>
 
         
+          <li class="toctree-l1 ">
+            
+              <a href="input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
+          </li>
+
+        
           <li class="toctree-l1 ">
             
               <a href="safety.html" class="reference internal ">Safety</a>
@@ -239,8 +248,8 @@
        title="previous chapter">← <span class="section-number">3. </span>The Evals Gap</a>
   </li>
   <li class="next">
-    <a href="safety.html"
-       title="next chapter"><span class="section-number">5. </span>Safety →</a>
+    <a href="input.html"
+       title="next chapter"><span class="section-number">5. </span>Managing Input Data →</a>
   </li>
 </ul>
   
@@ -249,7 +258,7 @@
           <div class="content" role="main" v-pre>
             
   <section class="tex2jax_ignore mathjax_ignore" id="structured-output">
-<span id="structure"></span><h1><a class="toc-backref" href="#id201" role="doc-backlink"><span class="section-number">4. </span>Structured Output</a><a class="headerlink" href="#structured-output" title="Permalink to this heading">¶</a></h1>
+<span id="structure"></span><h1><a class="toc-backref" href="#id208" role="doc-backlink"><span class="section-number">4. </span>Structured Output</a><a class="headerlink" href="#structured-output" title="Permalink to this heading">¶</a></h1>
 <blockquote class="epigraph">
 <div><p>In limits, there is freedom. Creativity thrives within structure.</p>
 <p class="attribution">—Julia B. Cameron</p>
@@ -257,41 +266,41 @@
 <nav class="contents" id="contents">
 <p class="topic-title">Contents</p>
 <ul class="simple">
-<li><p><a class="reference internal" href="#structured-output" id="id201">Structured Output</a></p>
+<li><p><a class="reference internal" href="#structured-output" id="id208">Structured Output</a></p>
 <ul>
-<li><p><a class="reference internal" href="#introduction" id="id202">Introduction</a></p></li>
-<li><p><a class="reference internal" href="#problem-statement" id="id203">Problem Statement</a></p></li>
-<li><p><a class="reference internal" href="#techniques" id="id204">Techniques</a></p>
+<li><p><a class="reference internal" href="#introduction" id="id209">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#problem-statement" id="id210">Problem Statement</a></p></li>
+<li><p><a class="reference internal" href="#techniques" id="id211">Techniques</a></p>
 <ul>
-<li><p><a class="reference internal" href="#prompt-engineering" id="id205">Prompt Engineering</a></p></li>
-<li><p><a class="reference internal" href="#json-mode-fine-tuned" id="id206">JSON Mode (Fine-Tuned)</a></p></li>
-<li><p><a class="reference internal" href="#logit-post-processing" id="id207">Logit Post-Processing</a></p></li>
+<li><p><a class="reference internal" href="#prompt-engineering" id="id212">Prompt Engineering</a></p></li>
+<li><p><a class="reference internal" href="#json-mode-fine-tuned" id="id213">JSON Mode (Fine-Tuned)</a></p></li>
+<li><p><a class="reference internal" href="#logit-post-processing" id="id214">Logit Post-Processing</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#tools" id="id208">Tools</a></p>
+<li><p><a class="reference internal" href="#tools" id="id215">Tools</a></p>
 <ul>
-<li><p><a class="reference internal" href="#outlines" id="id209">Outlines</a></p></li>
-<li><p><a class="reference internal" href="#langchain" id="id210">LangChain</a></p></li>
-<li><p><a class="reference internal" href="#ollama" id="id211">Ollama</a></p></li>
+<li><p><a class="reference internal" href="#outlines" id="id216">Outlines</a></p></li>
+<li><p><a class="reference internal" href="#langchain" id="id217">LangChain</a></p></li>
+<li><p><a class="reference internal" href="#ollama" id="id218">Ollama</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#discussion" id="id212">Discussion</a></p>
+<li><p><a class="reference internal" href="#discussion" id="id219">Discussion</a></p>
 <ul>
-<li><p><a class="reference internal" href="#best-practices" id="id213">Best Practices</a></p></li>
-<li><p><a class="reference internal" href="#comparing-solutions" id="id214">Comparing Solutions</a></p></li>
-<li><p><a class="reference internal" href="#research-and-ongoing-debate" id="id215">Research and Ongoing Debate</a></p></li>
+<li><p><a class="reference internal" href="#best-practices" id="id220">Best Practices</a></p></li>
+<li><p><a class="reference internal" href="#comparing-solutions" id="id221">Comparing Solutions</a></p></li>
+<li><p><a class="reference internal" href="#research-and-ongoing-debate" id="id222">Research and Ongoing Debate</a></p></li>
 </ul>
 </li>
-<li><p><a class="reference internal" href="#conclusion" id="id216">Conclusion</a></p></li>
-<li><p><a class="reference internal" href="#acknowledgements" id="id217">Acknowledgements</a></p></li>
-<li><p><a class="reference internal" href="#citation" id="id218">Citation</a></p></li>
-<li><p><a class="reference internal" href="#references" id="id219">References</a></p></li>
+<li><p><a class="reference internal" href="#conclusion" id="id223">Conclusion</a></p></li>
+<li><p><a class="reference internal" href="#acknowledgements" id="id224">Acknowledgements</a></p></li>
+<li><p><a class="reference internal" href="#citation" id="id225">Citation</a></p></li>
+<li><p><a class="reference internal" href="#references" id="id226">References</a></p></li>
 </ul>
 </li>
 </ul>
 </nav>
 <section id="introduction">
-<h2><a class="toc-backref" href="#id202" role="doc-backlink"><span class="section-number">4.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id209" role="doc-backlink"><span class="section-number">4.1. </span>Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this heading">¶</a></h2>
 <p>Language Models excel at generating human-like text, but they often struggle to produce output in a structured format, consistently. This poses a significant challenge when we need LLMs to generate data that can be easily processed by downstream systems, such as databases, APIs, or other software applications. Sometimes, even with a well-crafted prompt, an LLM might produce an unstructured response when a structured one is expected. This can be particularly challenging when integrating LLMs into systems that require specific data formats.</p>
 <p>What user needs drive the demand for LLM output constraints when building LLM-based applications? In a recent work by Google Research <span id="id1">[<a class="reference internal" href="#id51" title="Michael Xieyang Liu, Frederick Liu, Alexander J. Fiannaca, Terry Koo, Lucas Dixon, Michael Terry, and Carrie J. Cai. &quot;we need structured output&quot;: towards user-centered constraints on large language model output. In Extended Abstracts of the CHI Conference on Human Factors in Computing Systems, CHI EA '24. New York, NY, USA, 2024. Association for Computing Machinery. URL: https://doi.org/10.1145/3613905.3650756, doi:10.1145/3613905.3650756.">Liu <em>et al.</em>, 2024</a>]</span>, the authors explore the user need for constraints on the output of large language models, drawing on a survey of 51 industry professionals who use LLMs in their work. These needs can be broadly categorized as follows:</p>
 <p><strong>1. Improving Developer Efficiency and Workflow</strong></p>
@@ -315,7 +324,7 @@ <h2><a class="toc-backref" href="#id202" role="doc-backlink"><span class="sectio
 <p>It is important to emphasize that the ability to constrain LLM output is not just a technical consideration but a fundamental user need, impacting developer efficiency, user experience, and the overall success of LLM-powered applications.</p>
 </section>
 <section id="problem-statement">
-<h2><a class="toc-backref" href="#id203" role="doc-backlink"><span class="section-number">4.2. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id210" role="doc-backlink"><span class="section-number">4.2. </span>Problem Statement</a><a class="headerlink" href="#problem-statement" title="Permalink to this heading">¶</a></h2>
 <p>Language models based on the Transformer architecture are next token prediction machines.
 These models calculate the probability of observing a token (from a vocabulary of size <span class="math notranslate nohighlight">\(n\)</span>) conditioned on the previous tokens in the sequence. This process can be expressed mathematically as:</p>
 <div class="math notranslate nohighlight">
@@ -335,7 +344,7 @@ <h2><a class="toc-backref" href="#id203" role="doc-backlink"><span class="sectio
 </ul>
 </section>
 <section id="techniques">
-<h2><a class="toc-backref" href="#id204" role="doc-backlink"><span class="section-number">4.3. </span>Techniques</a><a class="headerlink" href="#techniques" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id211" role="doc-backlink"><span class="section-number">4.3. </span>Techniques</a><a class="headerlink" href="#techniques" title="Permalink to this heading">¶</a></h2>
 <p>There are many techniques to obtain structured output from LLMs <span id="id3">[<a class="reference internal" href="#id19" title="Xun Liang, Hanyu Wang, Yezhaohui Wang, Shichao Song, Jiawei Yang, Simin Niu, Jie Hu, Dan Liu, Shunyu Yao, Feiyu Xiong, and Zhiyu Li. Controllable text generation for large language models: a survey. 2024. URL: https://arxiv.org/abs/2408.12599, arXiv:2408.12599.">Liang <em>et al.</em>, 2024</a>]</span>. They can be broadly categorized into two types based on the phase they are applied to:</p>
 <ol class="arabic simple">
 <li><p><strong>Training-Time Techniques (TTT)</strong>: These techniques are applied during the training or post-training phases of the LLM. They are used to guide the model to learn the specific patterns and structures that are required for the task at hand.</p></li>
@@ -362,7 +371,7 @@ <h2><a class="toc-backref" href="#id204" role="doc-backlink"><span class="sectio
 </li>
 </ul>
 <section id="prompt-engineering">
-<h3><a class="toc-backref" href="#id205" role="doc-backlink"><span class="section-number">4.3.1. </span>Prompt Engineering</a><a class="headerlink" href="#prompt-engineering" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id212" role="doc-backlink"><span class="section-number">4.3.1. </span>Prompt Engineering</a><a class="headerlink" href="#prompt-engineering" title="Permalink to this heading">¶</a></h3>
 <p>In one-shot prompting, you provide a single example of the desired output format within the prompt.</p>
 <p>As a motivating example, consider the following simple task: Given a segment of a SEC financial filing, generate a two-person discussion about the key financial data from the text in JSON format, simulating what would be a real-world discussion about the underlying companies’ disclosed financial information. We would like to generate a structured output that can be easily parsed and integrated with other systems.</p>
 <div class="cell docutils container">
@@ -467,7 +476,7 @@ <h3><a class="toc-backref" href="#id205" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="json-mode-fine-tuned">
-<h3><a class="toc-backref" href="#id206" role="doc-backlink"><span class="section-number">4.3.2. </span>JSON Mode (Fine-Tuned)</a><a class="headerlink" href="#json-mode-fine-tuned" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id213" role="doc-backlink"><span class="section-number">4.3.2. </span>JSON Mode (Fine-Tuned)</a><a class="headerlink" href="#json-mode-fine-tuned" title="Permalink to this heading">¶</a></h3>
 <p>One-shot prompting is a simple technique that can lead to material improvements in structured output, though may not be sufficient for complex (e.g. nested) structures and / or when the model’s output needs to be restricted to a specific set of options or types.</p>
 <p>Some models offer so-called “JSON Mode” as an attempt to handle those challenges, which are a form of fine-tuning, hence while useful it is not guaranteed to work for all models.</p>
 <p>JSON mode is a feature provided by most LLM API providers, such as OpenAI, that allows the model to generate output in JSON format. This is particularly useful when you need structured data as a result, such as when parsing the output programmatically or integrating it with other systems that require JSON input. As depicted in <a class="reference internal" href="#json-mode"><span class="std std-numref">Fig. 4.1</span></a>, JSON mode is implemented by instructing the LLM model to use JSON as response format and optionally defining a target schema.</p>
@@ -594,7 +603,7 @@ <h3><a class="toc-backref" href="#id206" role="doc-backlink"><span class="sectio
 <p>This example solution is specific to OpenAI’s API. That begs the question: How can we solve this problem generally for widely available LLM providers? Enters logit post-processing.</p>
 </section>
 <section id="logit-post-processing">
-<h3><a class="toc-backref" href="#id207" role="doc-backlink"><span class="section-number">4.3.3. </span>Logit Post-Processing</a><a class="headerlink" href="#logit-post-processing" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id214" role="doc-backlink"><span class="section-number">4.3.3. </span>Logit Post-Processing</a><a class="headerlink" href="#logit-post-processing" title="Permalink to this heading">¶</a></h3>
 <p>Logit post-processing is a technique that involves modifying the logits of the LLM’s output before it is converted into text.</p>
 <p>The text generation process follows a probabilistic approach. At each step, the model calculates the probability distribution over its entire vocabulary to determine the most likely next token.</p>
 <p>Let’s examine how an LLM processes an example prompt “Is Enzo a good name for a baby?” as depicted in <a class="reference internal" href="#logit"><span class="std std-numref">Fig. 4.2</span></a>:</p>
@@ -829,9 +838,9 @@ <h3><a class="toc-backref" href="#id207" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="tools">
-<h2><a class="toc-backref" href="#id208" role="doc-backlink"><span class="section-number">4.4. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id215" role="doc-backlink"><span class="section-number">4.4. </span>Tools</a><a class="headerlink" href="#tools" title="Permalink to this heading">¶</a></h2>
 <section id="outlines">
-<h3><a class="toc-backref" href="#id209" role="doc-backlink"><span class="section-number">4.4.1. </span>Outlines</a><a class="headerlink" href="#outlines" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id216" role="doc-backlink"><span class="section-number">4.4.1. </span>Outlines</a><a class="headerlink" href="#outlines" title="Permalink to this heading">¶</a></h3>
 <p>Outlines <span id="id4">[<a class="reference internal" href="#id25" title="Outlines. Type-safe structured output from llms. https://dottxt-ai.github.io/outlines/latest/, 2024. Accessed: 2024.">Outlines, 2024</a>]</span> is a library specifically focused on structured text generation from LLMs. Under the hood, Outlines works by adjusting the probability distribution of the model’s output logits - the raw scores from the final layer of the neural network that are normally converted into text tokens. By introducing carefully crafted logit biases, Outlines can guide the model to prefer certain tokens over others, effectively constraining its outputs to a predefined set of valid options.</p>
 <p>The authors solve the general guided generation problem <span id="id5">[<a class="reference internal" href="#id74" title="Brandon T. Willard and Rémi Louf. Efficient guided generation for large language models. 2023. URL: https://arxiv.org/abs/2307.09702, arXiv:2307.09702.">Willard and Louf, 2023</a>]</span>, which as a consequence solves the problem of structured output generation, in LLMs by introducing an efficient indexing approach that reformulates neural text generation using finite-state machines (FSMs).</p>
 <p>They define the next token generation as a random variable:</p>
@@ -978,7 +987,7 @@ <h3><a class="toc-backref" href="#id209" role="doc-backlink"><span class="sectio
 <p>You can also use Outlines with LangChain <span id="id7">[<a class="reference internal" href="#id108" title="LangChain. Outlines integration documentation. Online Documentation, 2024b. Documentation on integrating Outlines library with LangChain for structured generation. URL: https://python.langchain.com/docs/integrations/chat/outlines/.">LangChain, 2024b</a>]</span>.</p>
 </section>
 <section id="langchain">
-<h3><a class="toc-backref" href="#id210" role="doc-backlink"><span class="section-number">4.4.2. </span>LangChain</a><a class="headerlink" href="#langchain" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id217" role="doc-backlink"><span class="section-number">4.4.2. </span>LangChain</a><a class="headerlink" href="#langchain" title="Permalink to this heading">¶</a></h3>
 <p>LangChain is a framework designed to simplify the development of LLM applications. It provider an abstraction layer over many LLM providers, including OpenAI, that offers several tools for parsing structured output.</p>
 <p>In particular, LangChain offers the <code class="docutils literal notranslate"><span class="pre">with_structured_output</span></code> method, which can be used with LLMs that support structured output APIs, allowing you to enforce a schema directly within the prompt.</p>
 <blockquote>
@@ -1036,7 +1045,7 @@ <h3><a class="toc-backref" href="#id210" role="doc-backlink"><span class="sectio
 <p>We observe that the model was able to extract the entities and places from the input text, and return them in the specified format. A full list of models that support <code class="docutils literal notranslate"><span class="pre">.with_structured_output()</span></code> can be found <a class="reference external" href="https://python.langchain.com/docs/integrations/chat/#featured-providers">here</a>.</p>
 </section>
 <section id="ollama">
-<h3><a class="toc-backref" href="#id211" role="doc-backlink"><span class="section-number">4.4.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id218" role="doc-backlink"><span class="section-number">4.4.3. </span>Ollama</a><a class="headerlink" href="#ollama" title="Permalink to this heading">¶</a></h3>
 <p>Ollama is a popular tool that allows you to run large language models (LLMs) locally. It has recently added support for structured output generation. The current <code class="docutils literal notranslate"><span class="pre">ollama</span></code> implementation leverages llama.cpp GBNF (GGML BNF) grammars <span id="id8">[<a class="reference internal" href="#id49" title="Ggerganov. Llama.cpp grammars documentation. https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md, 2024. Accessed: 2024.">Ggerganov, 2024</a>]</span> to enable structured output generation.</p>
 <p>llama.cpp GBNF forces language models to generate output in specific, predefined formats by constraining their outputs to follow precise rules and patterns. The system accomplishes this through a formal grammar specification that defines exactly how valid outputs can be constructed. It’s essentially an extension of BNF (Backus-Naur Form) <span id="id9">[<a class="reference internal" href="#id50" title="Wikipedia contributors. Backus naur form. https://en.wiktionary.org/wiki/Backus-Naur_form, 2024. Accessed: 2024.">Wikipedia contributors, 2024</a>]</span> with some modern regex-like features added. These rules carefully define what elements are allowed, how they can be combined, and what patterns of repetition and sequencing are valid. By enforcing these constraints during generation, GBNF ensures the model’s output strictly adheres to the desired format.</p>
 <p>Ollama first introduced structured output generation in version 0.5.1 providing support for JSON output but highlighting additional formats are coming soon.</p>
@@ -1134,9 +1143,9 @@ <h3><a class="toc-backref" href="#id211" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="discussion">
-<h2><a class="toc-backref" href="#id212" role="doc-backlink"><span class="section-number">4.5. </span>Discussion</a><a class="headerlink" href="#discussion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id219" role="doc-backlink"><span class="section-number">4.5. </span>Discussion</a><a class="headerlink" href="#discussion" title="Permalink to this heading">¶</a></h2>
 <section id="best-practices">
-<h3><a class="toc-backref" href="#id213" role="doc-backlink"><span class="section-number">4.5.1. </span>Best Practices</a><a class="headerlink" href="#best-practices" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id220" role="doc-backlink"><span class="section-number">4.5.1. </span>Best Practices</a><a class="headerlink" href="#best-practices" title="Permalink to this heading">¶</a></h3>
 <p>When implementing structured output with LLMs, it’s crucial to understand the distinction between different approaches. Some methods, like Outlines’ logit post-processing, provide mathematical guarantees that the output will conform to the specified structure. These contrast sharply with approaches like JSON mode, which rely on fine-tuned models or prompting that offer no formal guarantees. This distinction becomes particularly important in production environments where reliability and consistency are paramount. With that in mind, here are some best practices to consider when implementing structured output with LLMs:</p>
 <ul class="simple">
 <li><p><strong>Clear Schema Definition</strong>: Define the desired output structure clearly. This can be done in several ways including schemas, types, or Pydantic models as appropriate. This ensures the LLM knows exactly what format is expected.</p></li>
@@ -1146,7 +1155,7 @@ <h3><a class="toc-backref" href="#id213" role="doc-backlink"><span class="sectio
 <p>In summary, first one needs to clearly define the typed structure LLM applications will interface with, then determine whether strong guarantees are needed in order to determine tradeoffs between control and ease of implementation.</p>
 </section>
 <section id="comparing-solutions">
-<h3><a class="toc-backref" href="#id214" role="doc-backlink"><span class="section-number">4.5.2. </span>Comparing Solutions</a><a class="headerlink" href="#comparing-solutions" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id221" role="doc-backlink"><span class="section-number">4.5.2. </span>Comparing Solutions</a><a class="headerlink" href="#comparing-solutions" title="Permalink to this heading">¶</a></h3>
 <p>The choice of framework for structured LLM output depends heavily on specific constraints, requirements and use cases. LangChain is the most used LLM framework today with a large developer community base however its structured output support depends on the underlying LLM provider support. Ollama enables straightforward local deployment and experimentation democratizing access to LLMs while fostering privacy and control, however today it only offers JSON format with further formats to come. Outlines emerges as a solution with great flexibility and control over output structure while providing support for a wide range of LLMs. <a class="reference internal" href="#structured-output-frameworks"><span class="std std-numref">Table 4.1</span></a> provides a summary comparison of the different frameworks.</p>
 <table class="docutils align-default" id="structured-output-frameworks">
 <caption><span class="caption-number">Table 4.1 </span><span class="caption-text">Structured Output Frameworks Comparison</span><a class="headerlink" href="#structured-output-frameworks" title="Permalink to this table">¶</a></caption>
@@ -1193,7 +1202,7 @@ <h3><a class="toc-backref" href="#id214" role="doc-backlink"><span class="sectio
 <p>Other related tools not covered in this chapter worth mentioning include Guidance <span id="id10">[<a class="reference internal" href="#id96" title="Guidance AI. Guidance: language model programming. GitHub Repository, 2024. Framework for programming language models with structured templating and control flow. URL: https://github.com/guidance-ai/guidance.">Guidance AI, 2024</a>]</span> and NVIDIA’s Logits Processor Zoo <span id="id11">[<a class="reference internal" href="#id95" title="NVIDIA. Logits processor zoo. GitHub Repository, 2024a. Collection of logits processors for controlling language model generation. URL: https://github.com/NVIDIA/logits-processor-zoo.">NVIDIA, 2024a</a>]</span>.</p>
 </section>
 <section id="research-and-ongoing-debate">
-<h3><a class="toc-backref" href="#id215" role="doc-backlink"><span class="section-number">4.5.3. </span>Research and Ongoing Debate</a><a class="headerlink" href="#research-and-ongoing-debate" title="Permalink to this heading">¶</a></h3>
+<h3><a class="toc-backref" href="#id222" role="doc-backlink"><span class="section-number">4.5.3. </span>Research and Ongoing Debate</a><a class="headerlink" href="#research-and-ongoing-debate" title="Permalink to this heading">¶</a></h3>
 <p>The use of structured output for Large Language Models (LLMs) is a developing area. While the ability to constrain LLM outputs offer clear benefits in parsing, robustness, and integration, there is growing debate on whether it also potentially comes at the cost of performance as well as reasoning abilities. Research in this area should be taken with a grain of salt since findings are mixed and often depend on the specific task and model family at hand furthermore model families are not always comparable and are getting updated by the day! Nonetheless, early findings provide some interesting insights as to why there is no one-size-fits-all solution when it comes to LLMs structured output.</p>
 <p>There is some evidence indicating that LLMs may have bias in their handling of different output formats <span id="id12">[<a class="reference internal" href="#id52" title="Do Xuan Long, Hai Nguyen Ngoc, Tiviatis Sim, Hieu Dao, Shafiq Joty, Kenji Kawaguchi, Nancy F Chen, and Min-Yen Kan. Llms are biased towards output formats! systematically evaluating and mitigating output format bias of llms. arXiv preprint arXiv:2408.08656, 2024.">Long <em>et al.</em>, 2024</a>]</span>. The study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats.  The researchers attributed these biases to the models’ underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON’s prevalence in training data, highlighting how a format’s popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.</p>
 <p>Recent research “Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models” <span id="id13">[<a class="reference internal" href="#id26" title="Zhi Rui Tam, Cheng-Kuang Wu, Yi-Lin Tsai, Chieh-Yen Lin, Hung-yi Lee, and Yun-Nung Chen. Let me speak freely? a study on the impact of format restrictions on performance of large language models. 2024. URL: https://arxiv.org/abs/2408.02442, arXiv:2408.02442.">Tam <em>et al.</em>, 2024</a>]</span> suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence <span id="id14">[<a class="reference internal" href="#id28" title="Aider. Code in json: structured output for llms. https://aider.chat/2024/08/14/code-in-json.html, 2024. Accessed: 2024.">Aider, 2024</a>]</span> suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:</p>
@@ -1223,16 +1232,16 @@ <h3><a class="toc-backref" href="#id215" role="doc-backlink"><span class="sectio
 </section>
 </section>
 <section id="conclusion">
-<h2><a class="toc-backref" href="#id216" role="doc-backlink"><span class="section-number">4.6. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id223" role="doc-backlink"><span class="section-number">4.6. </span>Conclusion</a><a class="headerlink" href="#conclusion" title="Permalink to this heading">¶</a></h2>
 <p>Extracting structured output from LLMs is crucial for integrating them into real-world applications. By understanding the challenges and employing appropriate strategies and tools, developers can improve the reliability and usability of LLM-powered systems, unlocking their potential to automate complex tasks and generate valuable insights.</p>
 <p>Prompt engineering and the use of fine-tuned models can help control the output of LLMs. However, when strong guarantees are needed, practitioners should consider techniques such as logit post-processing either by manually adjusting the model’s output logits or using frameworks like Outlines that provider a higher level of control over the generation process.</p>
 </section>
 <section id="acknowledgements">
-<h2><a class="toc-backref" href="#id217" role="doc-backlink"><span class="section-number">4.7. </span>Acknowledgements</a><a class="headerlink" href="#acknowledgements" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id224" role="doc-backlink"><span class="section-number">4.7. </span>Acknowledgements</a><a class="headerlink" href="#acknowledgements" title="Permalink to this heading">¶</a></h2>
 <p>We would like to thank <a class="reference external" href="https://x.com/cameron_pfiffer">Cameron Pfiffer</a> from the .txt team for his insightful review and feedback.</p>
 </section>
 <section id="citation">
-<h2><a class="toc-backref" href="#id218" role="doc-backlink"><span class="section-number">4.8. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id225" role="doc-backlink"><span class="section-number">4.8. </span>Citation</a><a class="headerlink" href="#citation" title="Permalink to this heading">¶</a></h2>
 <p><a class="reference external" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="CC BY-NC-SA 4.0" src="https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png" /></a></p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@misc</span><span class="p">{</span><span class="n">tharsistpsouza2024tamingllms</span><span class="p">,</span>
   <span class="n">author</span> <span class="o">=</span> <span class="p">{</span><span class="n">Tharsis</span> <span class="n">T</span><span class="o">.</span> <span class="n">P</span><span class="o">.</span> <span class="n">Souza</span><span class="p">},</span>
@@ -1246,7 +1255,7 @@ <h2><a class="toc-backref" href="#id218" role="doc-backlink"><span class="sectio
 </div>
 </section>
 <section id="references">
-<h2><a class="toc-backref" href="#id219" role="doc-backlink"><span class="section-number">4.9. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
+<h2><a class="toc-backref" href="#id226" role="doc-backlink"><span class="section-number">4.9. </span>References</a><a class="headerlink" href="#references" title="Permalink to this heading">¶</a></h2>
 <div class="docutils container" id="id17">
 <div class="citation" id="id28" role="doc-biblioentry">
 <span class="label"><span class="fn-bracket">[</span><a role="doc-backlink" href="#id14">Aid24</a><span class="fn-bracket">]</span></span>
@@ -1339,8 +1348,8 @@ <h2><a class="toc-backref" href="#id219" role="doc-backlink"><span class="sectio
        title="previous chapter">← <span class="section-number">3. </span>The Evals Gap</a>
   </li>
   <li class="next">
-    <a href="safety.html"
-       title="next chapter"><span class="section-number">5. </span>Safety →</a>
+    <a href="input.html"
+       title="next chapter"><span class="section-number">5. </span>Managing Input Data →</a>
   </li>
 </ul><div class="footer" role="contentinfo">
       &#169; Copyright Tharsis T. P. Souza, 2024.
diff --git a/tamingllms/_build/html/objects.inv b/tamingllms/_build/html/objects.inv
index fbbd7b8..c16db8e 100644
Binary files a/tamingllms/_build/html/objects.inv and b/tamingllms/_build/html/objects.inv differ
diff --git a/tamingllms/_build/html/search.html b/tamingllms/_build/html/search.html
index ec2b928..3471427 100644
--- a/tamingllms/_build/html/search.html
+++ b/tamingllms/_build/html/search.html
@@ -167,6 +167,15 @@
             
 
             
+          </li>
+
+        
+          <li class="toctree-l1 ">
+            
+              <a href="notebooks/input.html" class="reference internal ">Managing Input Data</a>
+            
+
+            
           </li>
 
         
diff --git a/tamingllms/_build/html/searchindex.js b/tamingllms/_build/html/searchindex.js
index 4ba3292..b481c6c 100644
--- a/tamingllms/_build/html/searchindex.js
+++ b/tamingllms/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/cost", "notebooks/evals", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/cost.ipynb", "notebooks/evals.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["<span class=\"section-number\">2. </span>About the Book", "<span class=\"section-number\">1. </span>Preface", "Taming LLMs", "<span class=\"section-number\">6. </span>Preference-Based Alignment", "<span class=\"section-number\">8. </span>The Falling Cost Paradox", "<span class=\"section-number\">3. </span>The Evals Gap", "<span class=\"section-number\">7. </span>Local LLMs in Practice", "<span class=\"section-number\">5. </span>Safety", "<span class=\"section-number\">4. </span>Structured Output"], "terms": {"am": [0, 7], "alwai": [0, 3, 4, 5, 8], "do": [0, 3, 4, 5, 6, 7, 8], "which": [0, 3, 4, 5, 6, 7, 8], "cannot": [0, 3, 4, 5, 6, 7], "order": [0, 3, 5, 7, 8], "mai": [0, 1, 3, 4, 5, 6, 7, 8], "learn": [0, 3, 5, 6, 7, 8], "how": [0, 1, 3, 4, 5, 6, 7, 8], "pablo": [0, 5], "picasso": 0, "In": [0, 3, 4, 5, 6, 7, 8], "recent": [0, 3, 4, 5, 6, 7, 8], "year": [0, 2, 3, 4, 5, 6, 7, 8], "larg": [0, 1, 2, 3, 4, 5, 6, 7, 8], "languag": [0, 1, 2, 4, 5, 6, 7, 8], "model": [0, 1, 2, 4, 7, 8], "llm": [0, 1, 3, 8], "have": [0, 1, 3, 4, 5, 6, 7, 8], "emerg": [0, 3, 4, 6, 7, 8], "transform": [0, 1, 3, 5, 6, 7, 8], "forc": [0, 5, 8], "technologi": [0, 1, 4, 5, 6, 7], "promis": [0, 3, 4, 5, 7], "revolution": [0, 7], "build": [0, 2, 3, 5, 6, 7, 8], "product": [0, 1, 2, 3, 4, 5, 6, 7, 8], "interact": [0, 3, 4, 5, 6, 7, 8], "comput": [0, 3, 4, 5, 6, 7, 8], "from": [0, 1, 4, 5, 6, 7, 8], "chatgpt": [0, 3, 4, 6, 8], "github": [0, 2, 3, 4, 5, 6, 7, 8], "copilot": 0, "claud": [0, 3, 5, 6, 7], "artifact": 0, "system": [0, 3, 4, 5, 6, 7, 8], "captur": [0, 1, 3, 5, 6, 7], "public": [0, 3, 5, 6, 7], "imagin": [0, 6], "spark": 0, "gold": [0, 3, 5, 7], "rush": 0, "ai": [0, 3, 4, 5, 6, 8], "power": [0, 2, 3, 4, 5, 6, 7, 8], "applic": [0, 1, 2, 3, 4, 6, 7, 8], "howev": [0, 3, 4, 5, 6, 7, 8], "beneath": 0, "surfac": [0, 5], "technolog": [0, 1, 4, 5, 7], "revolut": [0, 4], "li": [0, 3, 5, 6, 7, 8], "complex": [0, 1, 3, 5, 6, 7, 8], "landscap": [0, 3, 5, 6], "practition": [0, 1, 4, 5, 6, 8], "must": [0, 3, 4, 5, 6, 7, 8], "navig": [0, 2, 5, 6, 7], "focus": [0, 3, 4, 5, 6, 7, 8], "bring": [0, 3, 6], "awar": [0, 3, 4, 5, 7], "limit": [0, 1, 2, 4, 5, 6, 7, 8], "har": [0, 2, 5], "solut": [0, 2, 4, 5, 6, 7], "overcom": [0, 5], "them": [0, 1, 3, 4, 5, 6, 7, 8], "robust": [0, 3, 4, 5, 6, 7, 8], "It": [0, 3, 4, 5, 6, 7, 8], "offer": [0, 3, 4, 5, 6, 7, 8], "critic": [0, 2, 3, 4, 5, 6, 7, 8], "implement": [0, 2, 3, 4, 5, 6, 8], "back": [0, 5, 6, 7, 8], "reproduc": [0, 1, 2, 5, 6], "exampl": [0, 1, 2, 3, 5, 6, 7, 8], "while": [0, 1, 2, 3, 4, 5, 6, 7, 8], "mani": [0, 1, 3, 4, 5, 6, 7, 8], "resourc": [0, 3, 4, 5, 6, 7], "cover": [0, 3, 4, 5, 6, 7, 8], "capabl": [0, 1, 2, 4, 5, 6, 7, 8], "specif": [0, 3, 4, 5, 6, 8], "hidden": [0, 3, 7], "pitfal": [0, 1, 3, 4, 5, 6, 8], "engin": [0, 1, 2, 3, 4, 5, 6, 7], "technic": [0, 1, 2, 3, 5, 6, 8], "manag": [0, 1, 4, 5, 6, 7, 8], "face": [0, 3, 4, 5, 6, 7], "when": [0, 1, 2, 3, 4, 5, 6, 7, 8], "comprehens": [0, 2, 3, 4, 5, 6, 7, 8], "guid": [0, 1, 3, 4, 5, 6, 7, 8], "leverag": [0, 3, 5, 6, 7, 8], "battl": [0, 2], "test": [0, 2, 3, 4, 6, 7, 8], "tool": [0, 1, 3, 4], "throughout": [0, 4, 5, 6, 7], "tackl": [0, 3, 5, 7], "follow": [0, 3, 4, 5, 6, 7, 8], "non": [0, 3, 6, 7, 8], "exhaust": [0, 6], "list": [0, 3, 5, 6, 7, 8], "structur": [0, 3, 4, 5, 6, 7], "un": 0, "reliabl": [0, 1, 3, 4, 5, 6, 7, 8], "struggl": [0, 1, 3, 5, 6, 7, 8], "maintain": [0, 1, 3, 4, 5, 6, 7, 8], "consist": [0, 1, 3, 4, 5, 6, 7, 8], "output": [0, 1, 3, 5, 6, 7], "format": [0, 3, 4, 5, 6, 7, 8], "complic": [0, 7], "integr": [0, 1, 3, 4, 5, 6, 7, 8], "larger": [0, 3, 4, 5, 6, 7, 8], "make": [0, 3, 4, 5, 6, 7, 8], "error": [0, 3, 5, 7, 8], "handl": [0, 3, 4, 5, 6, 7, 8], "more": [0, 1, 3, 5, 6, 7, 8], "size": [0, 3, 5, 6, 7, 8], "length": [0, 3, 5, 6, 8], "constraint": [0, 1, 3, 4, 5, 6, 7, 8], "ar": [0, 1, 3, 4, 5, 6, 7, 8], "sensit": [0, 3, 4, 5, 6, 7], "input": [0, 3, 5, 6, 7, 8], "data": [0, 1, 4, 5, 6, 7, 8], "requir": [0, 3, 6, 7, 8], "care": [0, 3, 4, 5, 6, 7, 8], "strategi": [0, 3, 4, 5, 6, 7, 8], "long": [0, 1, 3, 4, 5, 6, 7, 8], "form": [0, 3, 4, 5, 6, 7, 8], "unstructur": [0, 6, 8], "effect": [0, 1, 3, 4, 5, 7, 8], "tradit": [0, 3, 6, 7], "softwar": [0, 1, 3, 4, 6, 7, 8], "methodologi": [0, 3, 5, 6, 7, 8], "break": [0, 1, 3, 4, 5, 7], "down": [0, 1, 4, 5, 6, 7], "deal": [0, 3, 6], "determinist": [0, 8], "gener": [0, 1, 4, 6, 8], "new": [0, 2, 3, 4, 5, 6, 7, 8], "safeti": [0, 3, 5, 8], "align": [0, 4, 5, 6, 7, 8], "can": [0, 1, 3, 4, 5, 6, 7, 8], "harm": [0, 3, 5, 6], "bias": [0, 3, 5, 6, 7, 8], "inappropri": [0, 3, 7], "safeguard": [0, 5, 7], "monitor": [0, 3, 4, 5, 6, 7], "ensur": [0, 3, 4, 5, 6, 7, 8], "safe": [0, 3, 5, 7, 8], "deploy": [0, 3, 4, 5, 7, 8], "cost": [0, 3, 5, 7, 8], "optim": [0, 1, 5, 6, 7], "The": [0, 1, 3, 7, 8], "financi": [0, 1, 3, 4, 5, 7, 8], "oper": [0, 3, 5, 6, 7, 8], "base": [0, 1, 4, 6, 8], "quickli": [0, 3, 4, 6], "becom": [0, 3, 4, 5, 6, 7, 8], "prohibit": [0, 3, 5, 6], "without": [0, 1, 3, 4, 5, 6, 7, 8], "vendor": [0, 4, 5, 6], "lock": [0, 3, 4, 6], "cloud": [0, 3, 4, 5, 6, 7, 8], "provid": [0, 2, 3, 4, 5, 6, 7, 8], "creat": [0, 1, 3, 4, 5, 6, 7, 8], "signific": [0, 3, 4, 5, 6, 7, 8], "depend": [0, 3, 4, 5, 6, 8], "through": [0, 1, 2, 3, 4, 5, 6, 7, 8], "proprietari": [0, 3, 6, 7, 8], "infrastructur": [0, 4, 6], "difficult": [0, 3, 5, 7], "switch": [0, 6], "self": [0, 3, 5, 6, 7, 8], "host": [0, 4, 5, 6, 7], "take": [0, 2, 3, 4, 5, 6, 7, 8], "hand": [0, 6, 7, 8], "focu": [0, 2, 3, 4, 5, 6, 7, 8], "access": [0, 3, 4, 5, 6, 7, 8], "all": [0, 1, 3, 4, 5, 6, 7, 8], "fulli": [0, 3, 5, 7], "document": [0, 3, 4, 5, 6, 7, 8], "allow": [0, 5, 6, 7, 8], "reader": [0, 2], "replic": [0, 5, 7, 8], "result": [0, 3, 4, 5, 7, 8], "exactli": [0, 5, 8], "design": [0, 1, 3, 6, 8], "run": [0, 3, 4, 5, 6, 7, 8], "consum": [0, 3, 4, 5, 6, 7, 8], "grade": [0, 3, 4, 5, 6, 7], "hardwar": [0, 3, 4, 5], "expens": [0, 3, 4, 5, 6, 7], "avail": [0, 3, 4, 5, 6, 7, 8], "notebook": [0, 3, 8], "modifi": [0, 3, 5, 7, 8], "extend": [0, 3, 4, 5, 6, 8], "built": [0, 5, 6, 7, 8], "us": [0, 1, 3, 4, 6, 7, 8], "free": [0, 1, 3, 5, 6, 7], "everyon": [0, 5, 6], "minim": [0, 3, 4, 5, 6, 7, 8], "framework": [0, 3, 4, 5, 6], "wai": [0, 3, 4, 5, 6, 7, 8], "priorit": [0, 3, 5, 6, 7], "transpar": [0, 3, 4, 5, 6, 7], "visibl": [0, 5], "being": [0, 3, 4, 5, 6, 7, 8], "better": [0, 2, 3, 4, 5, 6, 7], "understand": [0, 1, 2, 3, 4, 5, 6, 7, 8], "custom": [0, 3, 5, 8], "flexibl": [0, 4, 5, 6, 7, 8], "adapt": [0, 3, 4, 5, 6, 7], "case": [0, 4, 5, 8], "unlik": [0, 3, 5, 6], "black": [0, 3], "box": [0, 6], "commerci": [0, 3, 5, 6, 7, 8], "most": [0, 3, 4, 5, 6, 7, 8], "freeli": [0, 8], "foster": [0, 3, 5, 7, 8], "reduc": [0, 3, 4, 5, 6, 7, 8], "independ": [0, 5, 7, 8], "freedom": [0, 6, 8], "architectur": [0, 3, 4, 5, 6, 8], "decis": [0, 3, 4, 5, 6, 7], "keep": [0, 3, 5, 6, 7], "principl": [0, 3, 5, 6, 7], "itself": [0, 3, 5, 6, 7], "": [0, 1, 3, 4, 5, 6, 7, 8], "live": [0, 1, 5, 7], "evolv": [0, 3, 4, 5, 6, 7], "chang": [0, 3, 5, 6, 7, 8], "encourag": [0, 3, 5, 7, 8], "report": [0, 3, 5, 6, 7, 8], "suggest": [0, 3, 5, 6, 7, 8], "improv": [0, 3, 4, 5, 6, 7, 8], "contribut": [0, 4, 5, 6, 7], "via": [0, 3, 4, 5, 6, 7, 8], "pull": [0, 6], "request": [0, 3, 4, 5, 6, 7, 8], "share": [0, 3, 5, 6, 7, 8], "own": [0, 3, 4, 5, 6, 7], "experi": [0, 3, 4, 5, 6, 7, 8], "commun": [0, 3, 4, 5, 7, 8], "propos": [0, 4, 5, 7], "chapter": [0, 3, 4, 5, 6, 7, 8], "section": [0, 3, 4, 5, 6, 7, 8], "found": [0, 3, 4, 5, 6, 8], "http": [0, 1, 2, 3, 4, 5, 6, 7, 8], "com": [0, 2, 3, 4, 5, 6, 7, 8], "souzatharsi": [0, 2, 3, 4, 5, 6, 7, 8], "tamingllm": [0, 2, 3, 4, 5, 6, 7, 8], "whether": [0, 3, 4, 5, 6, 7, 8], "you": [0, 1, 3, 4, 5, 6, 7, 8], "ve": [0, 6], "typo": [0, 7], "want": [0, 1, 3, 6, 7, 8], "entir": [0, 4, 5, 6, 8], "welcom": 0, "look": [0, 2, 3, 4, 5, 6, 7], "our": [0, 1, 3, 4, 5, 6, 7, 8], "goal": [0, 1, 3, 5, 7, 8], "discourag": 0, "enabl": [0, 3, 4, 5, 6, 7, 8], "By": [0, 1, 2, 3, 5, 7, 8], "upfront": [0, 2, 4], "equip": [0, 2, 5, 7], "avoid": [0, 3, 5, 6, 7, 8], "current": [0, 2, 3, 4, 5, 7, 8], "discours": [0, 2], "around": [0, 2, 3, 5, 6, 7, 8], "tend": [0, 2, 5, 7], "toward": [0, 3, 5, 7, 8], "extrem": [0, 3, 4, 5, 7], "either": [0, 3, 5, 6, 7, 8], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 5], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7, 8], "rather": [0, 1, 3, 4, 5, 6, 7], "than": [0, 1, 3, 5, 6, 7, 8], "theoret": [0, 3], "examin": [0, 3, 5, 6, 7, 8], "first": [0, 1, 3, 4, 5, 6, 7, 8], "everi": [0, 4, 5, 7], "concept": [0, 3, 5, 7], "illustr": [0, 3, 5, 6, 7, 8], "execut": [0, 5, 6, 7], "immedi": [0, 3, 4, 5, 6], "analysi": [0, 1, 3, 4, 5, 6, 7], "balanc": [0, 3, 4, 5, 6, 7, 8], "both": [0, 3, 4, 5, 6, 7], "help": [0, 3, 4, 5, 6, 7, 8], "inform": [0, 3, 4, 5, 6, 7, 8], "intend": [0, 5, 6, 7], "develop": [0, 1, 3, 4, 5, 6, 7, 8], "step": [0, 1, 3, 4, 5, 6, 7, 8], "insight": [0, 3, 4, 5, 6, 7, 8], "along": [0, 3, 4, 5, 6, 7], "guidanc": [0, 3, 8], "could": [0, 1, 3, 4, 5, 6, 7, 8], "derail": 0, "project": [0, 3, 4, 5, 6, 7], "earli": [0, 3, 4, 5, 7, 8], "befor": [0, 3, 4, 5, 7, 8], "thei": [0, 1, 3, 4, 5, 6, 7, 8], "costli": [0, 5, 7], "problem": [0, 1, 2, 3, 4, 6, 7], "too": [0, 1, 3, 5, 6, 7], "late": [0, 3, 4, 7], "lifecycl": [0, 6, 7], "lead": [0, 1, 3, 4, 5, 6, 7, 8], "genai": [0, 1, 3, 7], "initi": [0, 1, 3, 4, 5, 6, 7, 8], "leader": [0, 2, 5], "advoc": [0, 7], "anyon": [0, 7], "seek": [0, 5, 6, 7], "work": [0, 1, 3, 5, 6, 7, 8], "typic": [0, 3, 4, 5, 6, 7, 8], "job": [0, 5, 6, 7], "role": [0, 3, 5, 6, 7, 8], "platform": [0, 5, 6, 7, 8], "backend": [0, 3, 5], "exist": [0, 3, 4, 5, 6], "ml": [0, 7], "transit": [0, 4, 5, 6, 8], "overse": 0, "motiv": [0, 3, 4, 5, 8], "need": [0, 3, 4, 5, 6, 7, 8], "readi": [0, 5, 7], "desir": [0, 3, 5, 8], "perform": [0, 3, 5, 7, 8], "after": [0, 1, 3, 5, 6, 7, 8], "read": [0, 3, 4, 5, 7, 8], "implic": [0, 1, 3, 5, 7], "recommend": [0, 3, 5, 6, 7, 8], "abl": [0, 3, 5, 8], "deploi": [0, 3, 5, 6, 7], "proper": [0, 3, 4, 6, 7, 8], "realist": [0, 3, 4, 7], "effort": [0, 5, 6, 7, 8], "estim": [0, 4, 5, 7], "impact": [0, 3, 4, 5, 6, 7, 8], "timelin": 0, "To": [0, 3, 5, 6, 7, 8], "should": [0, 3, 4, 5, 6, 7, 8], "basic": [0, 3, 5, 6, 7], "program": [0, 5, 6, 8], "knowledg": [0, 3, 5, 6, 7], "introductori": [0, 1, 2], "langchain": [0, 5], "e": [0, 1, 3, 4, 5, 6, 7, 8], "g": [0, 3, 4, 5, 6, 7, 8], "chat": [0, 3, 5, 6, 7, 8], "prompt": [0, 4, 5, 7], "templat": [0, 5, 8], "openai": [0, 3, 5, 6, 8], "anthrop": [0, 3, 8], "similar": [0, 3, 4, 5, 6, 8], "dive": [0, 4], "here": [0, 2, 3, 4, 5, 6, 7, 8], "get": [0, 3, 4, 5, 6, 7, 8], "start": [0, 3, 4, 5, 6, 7, 8], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6, 7], "virtual": [0, 5], "m": [0, 3, 5, 6, 7, 8], "venv": [0, 8], "tame": [0, 3, 4, 5, 6, 7, 8], "env": [0, 3, 5, 7, 8], "bin": [0, 6], "On": [0, 5, 6, 8], "window": [0, 4, 5, 6], "script": [0, 6], "try": [0, 1, 3, 5, 7, 8], "each": [0, 3, 4, 5, 6, 7, 8], "contain": [0, 3, 4, 5, 6, 7, 8], "possibl": [0, 3, 4, 5, 6, 7, 8], "includ": [0, 1, 3, 4, 5, 6, 7, 8], "necessari": [0, 3, 4, 5, 7], "instal": [0, 3, 5, 6, 8], "go": [0, 3, 5, 8], "feel": [0, 6], "prefer": [0, 5, 6, 7, 8], "packag": [0, 4, 5, 6, 8], "pip": [0, 3, 5, 6, 8], "poetri": [0, 7], "file": [0, 3, 5, 6, 7, 8], "root": [0, 3], "directori": [0, 5, 6], "add": [0, 3, 5, 6, 7], "other": [0, 3, 4, 5, 6, 7, 8], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 8], "commit": [0, 3, 5, 7], "version": [0, 3, 4, 5, 6, 7, 8], "control": [0, 1, 3, 4, 5, 6, 7, 8], "kept": [0, 5], "privat": [0, 5], "If": [0, 1, 3, 4, 5, 6, 7, 8], "encount": [0, 2, 5, 7], "rate": [0, 3, 4, 5, 6, 7], "consid": [0, 3, 4, 5, 6, 7, 8], "smaller": [0, 3, 4, 5, 6, 8], "retri": [0, 8], "logic": [0, 1, 3, 5, 7], "conflict": [0, 3, 5], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7, 8], "check": [0, 5, 6, 7, 8], "page": [0, 5, 6], "known": [0, 5, 7, 8], "now": [0, 1, 3, 4, 5, 6, 7, 8], "let": [0, 3, 4, 5, 6, 7, 8], "begin": [0, 5, 6, 7, 8], "explor": [0, 1, 3, 4, 5, 6, 7, 8], "dr": [0, 3], "tharsi": [0, 2, 3, 4, 5, 6, 7, 8], "souza": [0, 2, 3, 4, 5, 6, 7, 8], "scientist": [0, 1, 6, 7], "special": [0, 4, 5, 6, 7, 8], "he": [0, 3, 5, 7], "lectur": 0, "columbia": 0, "univers": [0, 5, 6, 7], "master": [0, 4, 6, 8], "scienc": [0, 3, 5, 7], "appli": [0, 3, 5, 6, 7, 8], "analyt": 0, "incom": [0, 5], "head": [0, 3, 5, 7, 8], "equiti": [0, 5], "citadel": 0, "former": [0, 1, 5, 6], "senior": [0, 5], "vp": 0, "two": [0, 3, 4, 5, 6, 7, 8], "sigma": [0, 3], "invest": [0, 3, 4, 5, 7], "also": [0, 3, 4, 5, 6, 7, 8], "enjoi": 0, "mentor": 0, "under": [0, 3, 4, 5, 6, 7, 8], "repres": [0, 3, 4, 5, 6, 8], "student": [0, 3, 7], "profession": [0, 3, 5, 7, 8], "divers": [0, 3, 4, 5, 7], "global": [0, 5, 7], "ecosystem": [0, 4, 5, 6], "With": [0, 3, 5, 6, 7, 8], "over": [0, 2, 3, 4, 5, 6, 7, 8], "15": [0, 5, 6, 7, 8], "deliv": [0, 4, 5, 6], "across": [0, 1, 3, 4, 5, 6, 7, 8], "startup": 0, "fortun": 0, "500": [0, 3, 5, 7], "compani": [0, 3, 4, 5, 7, 8], "numer": [0, 4, 5, 7, 8], "scholarli": 0, "frequent": [0, 5, 6, 8], "speaker": [0, 5], "academ": [0, 3, 5, 7], "busi": [0, 5, 6, 7], "confer": [0, 8], "ground": [0, 3, 5, 6], "background": [0, 1, 5, 6], "draw": [0, 3, 5, 7, 8], "scale": [0, 3, 4, 5, 6, 7, 8], "stage": [0, 3, 7, 8], "major": [0, 3, 4, 5, 6, 7, 8], "institut": [0, 5, 7], "well": [0, 3, 4, 5, 6, 7, 8], "advis": [0, 3], "profit": [0, 5, 7, 8], "organ": [0, 3, 4, 5, 6], "uniqu": [0, 3, 4, 5, 6, 7, 8], "bridg": [0, 6, 7], "gap": [0, 1, 3, 4, 6, 7], "between": [0, 1, 3, 4, 5, 6, 7, 8], "potenti": [0, 1, 3, 4, 5, 6, 7, 8], "next": [0, 1, 3, 4, 5, 6, 7, 8], "hold": [0, 3, 5], "ph": [0, 7], "d": [0, 3, 4, 5, 6, 7, 8], "ucl": 0, "london": 0, "phil": [0, 7], "sc": 0, "b": [0, 4, 5, 6, 7, 8], "tell": [1, 3, 7], "mere": [1, 5], "what": [1, 3, 4, 5, 6, 7, 8], "someth": [1, 5, 6], "i": [1, 2, 4, 5, 6, 7, 8], "emanuel": [1, 3, 5, 7], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7, 8], "altern": [1, 3, 4, 5, 6, 7], "titl": [1, 2, 3, 4, 5, 6, 7, 8], "thi": [1, 2, 3, 4, 5, 6, 7, 8], "book": [1, 5], "been": [1, 3, 4, 5, 6, 7], "behav": 1, "badli": 1, "come": [1, 3, 5, 6, 7, 8], "notic": [1, 3, 4, 5, 7], "parallel": [1, 3, 5, 6], "semin": [1, 7], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7, 8], "caution": 1, "against": [1, 3, 4, 5, 6, 7], "treat": [1, 5, 7], "perfect": [1, 5, 6], "represent": [1, 5, 6, 7], "realiti": [1, 7], "aim": [1, 3, 4, 5, 6, 7, 8], "highlight": [1, 3, 5, 6, 7, 8], "practic": [1, 3, 4, 5, 7], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 5, 6], "fail": [1, 3, 5, 7], "we": [1, 3, 4, 5, 6, 7, 8], "mistak": [1, 7], "approxim": [1, 4, 5, 8], "full": [1, 3, 4, 5, 6, 7, 8], "assumpt": [1, 5, 7], "core": [1, 4, 5, 6, 7], "premis": [1, 6], "hi": [1, 5, 7, 8], "aspect": [1, 3, 5, 7], "world": [1, 3, 4, 5, 6, 7, 8], "inher": [1, 2, 3, 5, 7, 8], "involv": [1, 3, 4, 5, 6, 7, 8], "simplif": 1, "argu": [1, 4, 7, 8], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 5, 7], "partli": 1, "becaus": [1, 3, 5, 7], "peopl": [1, 3, 5, 6, 7], "put": [1, 5, 6], "much": [1, 3, 5, 6], "faith": 1, "mathemat": [1, 5, 6, 8], "recogn": [1, 3, 5, 7], "human": [1, 4, 5, 6, 7, 8], "behavior": [1, 3, 5, 6, 7], "market": [1, 4, 5, 6, 8], "dynam": [1, 3, 5, 7], "hallucin": [1, 3, 5, 7, 8], "fact": [1, 3, 5, 7], "reason": [1, 3, 5, 6, 7, 8], "Their": [1, 5, 8], "respons": [1, 4, 5, 6, 7, 8], "often": [1, 3, 4, 5, 6, 7, 8], "convinc": [1, 3], "probabilist": [1, 5, 8], "train": [1, 4, 5, 6, 7, 8], "true": [1, 3, 4, 5, 7, 8], "even": [1, 3, 4, 5, 6, 7, 8], "though": [1, 3, 4, 5, 6, 7, 8], "insist": 1, "machin": [1, 3, 6, 7, 8], "todai": [1, 4, 6, 8], "grow": [1, 3, 5, 6, 7, 8], "pervas": [1, 7], "belief": [1, 6, 7], "solv": [1, 3, 4, 5, 6, 7, 8], "ani": [1, 3, 4, 5, 6, 7, 8], "context": [1, 3, 4, 5, 6, 7, 8], "content": 1, "wish": [1, 5], "user": [1, 4, 5, 6, 8], "moreov": 1, "were": [1, 3, 5, 6, 7, 8], "token": [1, 3, 4, 5, 6, 7, 8], "predict": [1, 3, 5, 6, 7, 8], "chatbot": [1, 3, 5, 6, 7], "twist": [1, 7], "wrap": [1, 6, 8], "further": [1, 3, 4, 5, 6, 7, 8], "daili": [1, 4, 6, 7], "life": [1, 5, 6, 7], "workflow": [1, 4, 5, 6, 7, 8], "affect": [1, 5, 6, 7], "decid": [1, 3, 5], "action": [1, 3, 5, 7], "coupl": [1, 6], "lack": [1, 3, 5, 7, 8], "pose": [1, 3, 5, 7, 8], "risk": [1, 3, 4, 5, 6], "still": [1, 4, 5, 6, 7], "figur": [1, 5, 6], "out": [1, 3, 4, 5, 6, 7, 8], "serv": [1, 3, 4, 5, 7, 8], "builder": [1, 6], "who": [1, 3, 5, 6, 7, 8], "remain": [1, 3, 4, 5, 6, 7], "clear": [1, 3, 4, 5, 6, 7, 8], "ei": 1, "about": [1, 3, 4, 5, 6, 7, 8], "therefor": [1, 3, 5, 6, 7], "end": [1, 3, 4, 5, 6, 7, 8], "detail": [1, 3, 4, 5, 6, 7, 8], "python": [1, 2, 5, 6, 7, 8], "code": [1, 2, 3, 5, 6, 7, 8], "diminish": [1, 3, 4, 5], "promot": [1, 3, 5, 7], "nuanc": [1, 3, 5, 6, 7, 8], "acknowledg": [1, 5, 7], "within": [1, 3, 4, 5, 7, 8], "trustworthi": [1, 7], "taught": 1, "u": [1, 3, 5, 7, 8], "where": [1, 3, 4, 5, 6, 7, 8], "der11": 1, "why": [1, 3, 5, 7, 8], "confus": [1, 4, 7], "illus": 1, "disast": [1, 5], "wall": [1, 6], "street": [1, 6], "press": [1, 5, 6], "isbn": [1, 3, 5], "9781439165010": 1, "url": [1, 2, 3, 4, 5, 6, 7, 8], "googl": [1, 5, 6, 8], "co": [1, 3, 4, 5, 6, 7], "uk": [1, 7], "id": [1, 5, 6, 7, 8], "lke_cwm4wm8c": 1, "sign": [2, 5, 7], "up": [2, 3, 4, 5, 6, 7], "receiv": [2, 3, 5, 6, 7, 8], "updat": [2, 3, 4, 5, 6, 7, 8], "abstract": [2, 5, 7, 8], "heavili": [2, 3, 4, 5, 7, 8], "gloss": 2, "fundament": [2, 3, 5, 6, 7, 8], "challeng": [2, 3, 4, 5, 6, 7, 8], "convers": [2, 3, 4, 5, 6, 7, 8], "kei": [2, 3, 4, 6, 7, 8], "proven": [2, 4], "yet": [2, 3, 4, 5, 7], "concret": [2, 4, 7, 8], "sidestep": 2, "misc": [2, 3, 4, 5, 6, 7, 8], "tharsistpsouza2024tamingllm": [2, 3, 4, 5, 6, 7, 8], "author": [2, 3, 4, 5, 6, 7, 8], "t": [2, 3, 4, 5, 6, 7, 8], "p": [2, 3, 4, 5, 6, 7, 8], "2024": [2, 3, 4, 5, 7, 8], "journal": [2, 3, 4, 5, 6, 7, 8], "repositori": [2, 3, 4, 5, 6, 7, 8], "valu": [3, 5, 6, 7, 8], "its": [3, 4, 5, 6, 7, 8], "privileg": 3, "abov": [3, 5, 7], "soon": [3, 8], "lose": [3, 5], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7, 8], "3": [3, 4, 5, 6, 8], "5": [3, 4, 5, 6, 8], "2022": [3, 5, 6, 7], "mark": [3, 5, 6, 7], "pivot": [3, 5, 6], "moment": 3, "histori": [3, 4, 5, 6], "artifici": [3, 5, 6, 7], "intellig": [3, 5, 6, 7], "five": [3, 5, 7], "dai": [3, 4, 5, 6, 7, 8], "launch": [3, 5, 7], "attract": [3, 5], "million": [3, 4, 5, 6], "month": [3, 4, 5, 6, 7], "becam": [3, 4], "fastest": [3, 5, 7], "100": [3, 4, 5, 6, 7, 8], "monthli": [3, 4, 5], "rais": [3, 4, 5, 7], "intrigu": 3, "question": [3, 4, 5, 6, 7, 8], "did": [3, 5, 8], "dramat": [3, 4, 5, 6, 8], "predecessor": 3, "gpt": [3, 4, 5, 6, 7, 8], "had": [3, 5], "same": [3, 5, 6, 7, 8], "number": [3, 4, 5, 6, 7, 8], "paramet": [3, 4, 5, 6, 7, 8], "far": [3, 4, 6, 7], "less": [3, 4, 5, 6, 7], "attent": [3, 4, 6], "arguabl": [3, 6], "feedback": [3, 5, 7, 8], "abil": [3, 4, 5, 6, 7, 8], "least": [3, 5, 7], "ey": 3, "breakthrough": [3, 6, 7], "demonstr": [3, 4, 5, 6, 7, 8], "crucial": [3, 4, 6, 7, 8], "greater": [3, 5, 6, 7], "process": [3, 4, 5, 6, 7], "modern": [3, 5, 8], "techniqu": [3, 4, 5, 6], "direct": [3, 5, 6, 7], "rafailov": 3, "et": [3, 4, 5, 6, 7, 8], "al": [3, 4, 5, 6, 7, 8], "present": [3, 5, 6, 7, 8], "autom": [3, 4, 5, 7, 8], "fashion": [3, 8], "open": [3, 4, 5, 7, 8], "sourc": [3, 4, 5, 7, 8], "common": [3, 4, 5, 6, 8], "pre": [3, 4, 5, 6, 7, 8], "default": [3, 5, 6, 7, 8], "state": [3, 5, 6, 7, 8], "art": [3, 5, 7], "object": [3, 4, 5, 6, 7, 8], "given": [3, 4, 5, 6, 7, 8], "webpag": 3, "internet": [3, 5], "veri": [3, 4, 5, 6, 7], "ask": [3, 5, 6, 7, 8], "instruct": [3, 4, 5, 6, 7, 8], "sai": [3, 8], "ouyang": [3, 7], "2": [3, 4, 5, 8], "explain": [3, 5], "moon": 3, "land": [3, 5, 6], "6": [3, 4, 5, 6], "old": [3, 5], "import": [3, 4, 5, 6, 7, 8], "pipelin": [3, 4, 5, 6, 7, 8], "pipe": [3, 7], "text": [3, 4, 5, 6, 7, 8], "gpt2": [3, 5], "msg": 3, "short": [3, 5, 7, 8], "sentenc": [3, 5, 7], "_": [3, 5, 7, 8], "rang": [3, 4, 5, 6, 7, 8], "len": [3, 5, 6, 7, 8], "print": [3, 4, 5, 6, 7, 8], "f": [3, 4, 5, 6, 7, 8], "n": [3, 5, 6, 7, 8], "1": [3, 4, 5, 6, 8], "0": [3, 4, 5, 6, 7, 8], "generated_text": [3, 8], "good": [3, 5, 6, 8], "idea": [3, 4, 6, 7, 8], "one": [3, 4, 5, 6, 7, 8], "those": [3, 5, 7, 8], "littl": [3, 5], "green": [3, 7], "dot": [3, 4], "Then": [3, 4, 5], "line": [3, 5, 6, 7], "later": [3, 5, 6, 7, 8], "re": [3, 4, 5, 6, 7, 8], "alreadi": [3, 5, 8], "movi": 3, "theori": [3, 5], "some": [3, 5, 6, 7, 8], "mean": [3, 4, 5, 6, 7, 8], "word": [3, 4, 5, 8], "tepid": 3, "articl": [3, 5, 6, 7], "sure": [3, 5, 7, 8], "lunar": 3, "As": [3, 4, 5, 6, 7, 8], "see": [3, 4, 5, 6, 7, 8], "coher": [3, 5, 6, 8], "explan": [3, 5, 7, 8], "child": [3, 5, 7], "nonsens": [3, 7], "meander": 3, "unrel": [3, 5, 7], "topic": [3, 5, 6, 7, 8], "simpl": [3, 5, 6, 7, 8], "appropri": [3, 4, 5, 6, 7, 8], "young": [3, 5, 7], "instead": [3, 4, 5, 6, 7, 8], "address": [3, 4, 5, 6, 7, 8], "issu": [3, 5, 7, 8], "introduc": [3, 5, 6, 7, 8], "rlhf": [3, 4, 7, 8], "intent": [3, 7], "wide": [3, 4, 5, 6, 7, 8], "task": [3, 4, 7, 8], "fig": [3, 4, 5, 6, 7, 8], "collect": [3, 5, 6, 7, 8], "sampl": [3, 6, 8], "label": [3, 5, 6, 7, 8], "comparison": 3, "reward": [3, 5, 6, 7], "sever": [3, 4, 5, 6, 7, 8], "rank": [3, 5, 6, 7], "best": [3, 4, 5, 6, 7], "worst": 3, "rm": [3, 6], "reinforc": [3, 5, 6, 7], "write": [3, 5, 6, 7, 8], "stori": [3, 7], "frog": 3, "calcul": [3, 4, 5, 6, 7, 8], "score": [3, 4, 5, 6, 7, 8], "ppo": [3, 6], "proxim": [3, 6], "iter": [3, 5, 6, 7, 8], "accur": [3, 4, 5, 6, 7], "undesir": [3, 7], "simplifi": [3, 5, 6, 8], "view": [3, 5, 7], "show": [3, 4, 5, 6, 7, 8], "progress": [3, 4, 7], "pattern": [3, 4, 5, 6, 7, 8], "ha": [3, 4, 5, 6, 7, 8], "instanc": [3, 4, 5, 6, 7], "directli": [3, 4, 5, 6, 7, 8], "For": [3, 4, 5, 6, 7, 8], "llama": [3, 4, 5, 7, 8], "guard": 3, "team": [3, 5, 6, 8], "8b": [3, 6, 7, 8], "wa": [3, 4, 5, 6, 7, 8], "classif": [3, 5, 6, 7, 8], "bypass": [3, 7], "similarli": [3, 4, 5, 6, 7], "zephyr": 3, "7b": [3, 5, 6, 7, 8], "alpha": [3, 5, 8], "mistral": [3, 8], "publicli": [3, 5, 8], "assist": [3, 5, 6, 7, 8], "paper": [3, 5, 6, 7, 8], "compon": [3, 5, 6], "particular": [3, 4, 5, 6, 7, 8], "foundat": [3, 4, 5, 6, 7], "advanc": [3, 4, 5, 6, 7, 8], "method": [3, 5, 7, 8], "strong": [3, 5, 6, 7, 8], "At": [3, 4, 5, 6, 8], "high": [3, 4, 5, 6, 7, 8], "level": [3, 4, 5, 7, 8], "carefulli": [3, 4, 5, 6, 7, 8], "curat": [3, 5, 6], "purpos": [3, 5, 6, 7, 8], "exhibit": [3, 5, 6, 7], "domain": [3, 4, 5, 6, 7], "emploi": [3, 5, 7, 8], "prove": [3, 5, 7], "particularli": [3, 4, 5, 6, 7, 8], "valuabl": [3, 5, 6, 8], "scenario": [3, 5, 6, 7, 8], "precis": [3, 4, 5, 6, 7, 8], "style": [3, 5], "tone": 3, "expertis": [3, 5, 7], "medic": [3, 5, 6], "legal": [3, 5, 6, 7], "field": [3, 5, 6, 7, 8], "adher": [3, 5, 7, 8], "guidelin": [3, 5, 7], "servic": [3, 4, 5, 6, 7], "standard": [3, 4, 5, 6, 7], "approach": [3, 5, 6, 8], "distinct": [3, 5, 6, 7, 8], "advantag": [3, 4, 5, 6, 7, 8], "weight": [3, 4, 5, 6, 7, 8], "maximum": [3, 5, 6, 7], "lora": [3, 6, 7], "low": [3, 4, 5, 6, 7, 8], "hu": [3, 7, 8], "2021": [3, 4, 5], "small": [3, 4, 5, 6, 8], "matric": 3, "effici": [3, 4, 5, 6, 7, 8], "qlora": 3, "quantiz": 3, "dettmer": 3, "2023": [3, 4, 5, 6, 7, 8], "combin": [3, 4, 5, 6, 7, 8], "memori": [3, 4, 5, 6, 7], "footprint": [3, 4, 6], "modest": [3, 6], "increas": [3, 4, 5, 6, 7, 8], "likelihood": [3, 5, 7, 8], "obtain": [3, 5, 6, 7, 8], "probabl": [3, 5, 6, 8], "outcom": [3, 5, 7, 8], "hong": [3, 5], "unintend": [3, 7], "suboptim": 3, "seen": [3, 5, 7], "research": [3, 4, 5, 6], "maxim": [3, 5], "shown": [3, 5, 6, 7], "alon": [3, 5, 6, 7], "gain": [3, 4, 5, 6, 7], "achiev": [3, 4, 5, 6, 7, 8], "bai": [3, 5, 7], "touvron": [3, 6], "sinc": [3, 4, 5, 6, 7, 8], "main": [3, 5, 6, 7, 8], "categori": [3, 5, 6, 7, 8], "algorithm": [3, 5, 7], "meanwhil": [3, 6], "superior": [3, 5, 7], "benchmark": 3, "xu": [3, 5, 6, 7], "schulman": [3, 7], "2017": [3, 5], "popular": [3, 6, 8], "understood": 3, "set": [3, 4, 5, 6, 7, 8], "rule": [3, 5, 6, 8], "govern": [3, 5], "reflect": [3, 5, 6, 7], "anoth": [3, 5, 6, 7], "adjust": [3, 5, 6, 7, 8], "One": [3, 4, 5, 6, 7, 8], "strength": [3, 5, 6, 7], "2024c": [3, 6], "real": [3, 4, 5, 6, 7, 8], "noisi": 3, "delai": [3, 5, 6, 7], "subsequ": [3, 8], "situat": [3, 5, 7], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7, 8], "stabl": [3, 5], "prevent": [3, 4, 5, 7, 8], "overreact": 3, "converg": 3, "due": [3, 5, 6, 7], "simplic": [3, 6], "award": [3, 5], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 6, 7, 8], "4": [3, 4, 5, 6, 8], "fit": [3, 4, 5, 7, 8], "pair": [3, 5, 7], "rl": [3, 7], "find": [3, 4, 5, 6, 7, 8], "contrast": [3, 4, 5, 6, 7, 8], "satisfi": [3, 5], "implicit": [3, 5, 7], "whose": [3, 5], "correspond": [3, 5, 8], "extract": [3, 4, 5, 6, 7, 8], "close": [3, 5, 6, 7], "compar": [3, 4, 5, 6, 7], "assign": [3, 5, 6, 7, 8], "higher": [3, 4, 5, 6, 8], "kl": [3, 6], "diverg": [3, 6], "origin": [3, 4, 5, 6, 7, 8], "preserv": [3, 6, 7, 8], "defin": [3, 4, 5, 6, 7, 8], "equat": 3, "mathcal": 3, "l": [3, 5], "pi_": 3, "theta": [3, 8], "ref": 3, "mathbb": [3, 8], "x": [3, 5, 6, 7, 8], "y_w": 3, "y_l": 3, "sim": [3, 8], "left": [3, 6], "log": [3, 4, 5, 6], "beta": [3, 5, 7, 8], "underbrac": 3, "frac": [3, 6, 7], "color": [3, 5], "red": 3, "right": [3, 5, 6, 7], "respect": [3, 5, 6, 7], "deviat": [3, 5, 6, 7], "straightforward": [3, 5, 6, 7, 8], "librari": [3, 4, 5, 6, 7, 8], "huggingfac": [3, 4, 5, 6, 7], "trl": [3, 6, 7], "2024d": [3, 6], "suit": [3, 5, 7], "friendli": [3, 5, 6], "interfac": [3, 4, 5, 6, 7, 8], "featur": [3, 5, 6, 7, 8], "distinguish": [3, 5, 7], "scalabl": [3, 5, 7], "doe": [3, 5, 6, 7, 8], "pretrain": [3, 5, 6], "hou": [3, 5, 6], "poor": [3, 5, 7], "return": [3, 4, 5, 6, 7, 8], "addit": [3, 4, 5, 6, 7, 8], "benefit": [3, 4, 5, 6, 7, 8], "fix": [3, 5, 6, 7], "invers": 3, "trend": [3, 4, 5, 7], "util": [3, 4, 5, 6, 7], "rapid": [3, 5, 6, 7], "yield": [3, 4, 5], "onli": [3, 4, 5, 6, 7, 8], "margin": [3, 5, 7, 8], "capit": [3, 5, 8], "inaccuraci": [3, 5], "nois": 3, "dure": [3, 4, 5, 6, 7, 8], "accuraci": [3, 4, 5, 6, 7, 8], "lag": [3, 5, 7], "significantli": [3, 4, 5, 6, 7], "indic": [3, 5, 6, 7, 8], "signal": [3, 7], "plateau": 3, "sophist": [3, 5, 6, 7], "previou": [3, 5, 6, 8], "deriv": [3, 5, 6], "pairwis": [3, 5], "feng": [3, 7], "substanti": [3, 4, 5, 6, 7], "wors": [3, 6, 8], "influenc": [3, 5, 7, 8], "success": [3, 4, 5, 6, 7, 8], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 6, 7], "loss": [3, 4, 5, 6, 7], "gradient": [3, 5, 7], "dispref": 3, "unbalanc": 3, "trajectori": [3, 4], "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6, 7], "These": [3, 4, 5, 6, 7, 8], "forward": [3, 5, 7], "futur": [3, 4, 5, 6, 7], "phenomenon": [3, 7, 8], "degrad": [3, 4, 5, 6, 7, 8], "danger": [3, 6, 7], "loop": [3, 5, 6, 7], "recurs": 3, "kazdan": 3, "qualiti": [3, 4, 5, 6, 7, 8], "pollut": 3, "replac": [3, 5, 6], "amplif": 3, "reduct": [3, 4, 5, 6], "express": [3, 4, 5, 7, 8], "catastroph": [3, 7], "forget": [3, 8], "previous": [3, 5, 7, 8], "mitig": [3, 4, 5, 6, 7, 8], "mix": [3, 5, 7, 8], "metric": [3, 6, 7], "sz\u00e9p": 3, "regular": [3, 5, 6, 7, 8], "relev": [3, 4, 5, 6, 7], "scarc": 3, "behaviour": 3, "strateg": [3, 5, 6, 7, 8], "compli": [3, 4, 5, 6, 7, 8], "modif": [3, 5, 6, 7], "outsid": [3, 5], "evidenc": 3, "landmark": 3, "askel": [3, 5, 7], "2024a": [3, 6, 8], "dec": 3, "explicitli": [3, 5, 6], "so": [3, 4, 5, 7, 8], "might": [3, 4, 5, 6, 7, 8], "pretend": 3, "adopt": [3, 5, 6, 7, 8], "actual": [3, 5, 6, 7, 8], "onc": [3, 5, 6, 7], "complet": [3, 5, 6, 7, 8], "describ": [3, 5, 6, 7], "harmless": [3, 7], "told": 3, "retrain": [3, 6], "queri": [3, 5], "tier": [3, 4, 5, 7], "paid": [3, 5], "column": [3, 5, 7], "condit": [3, 5, 8], "toxic": [3, 6, 7], "excerpt": [3, 5, 6], "scratchpad": 3, "refus": [3, 7, 8], "happen": [3, 7], "bomb": [3, 7], "engag": [3, 4, 5, 6, 7, 8], "intern": [3, 5, 7], "unmonitor": 3, "longer": [3, 5, 6], "believ": [3, 5, 6, 7, 8], "act": [3, 5, 6, 7, 8], "therebi": [3, 5], "reveal": [3, 4, 5, 6, 7], "complianc": [3, 4, 5, 6, 7], "phase": [3, 4, 5, 6, 8], "natur": [3, 5, 6, 7, 8], "evid": [3, 5, 6, 7, 8], "seemingli": 3, "surpris": 3, "appear": [3, 5, 7, 8], "criteria": [3, 5, 7], "underli": [3, 5, 7, 8], "anim": [3, 7], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 5, 6, 7, 8], "explicit": [3, 5, 6, 7, 8], "chain": [3, 5], "thought": [3, 5, 6, 8], "opaqu": 3, "aris": [3, 5, 7], "opu": 3, "sonnet": [3, 5, 6], "wherea": [3, 5], "haiku": [3, 7], "persist": [3, 4], "resist": [3, 5], "embed": [3, 4, 5, 6], "doesn": [3, 5, 6, 8], "anti": [3, 5], "lab": 3, "exfiltr": [3, 7], "protect": [3, 4, 5, 6, 7], "Not": [3, 5, 7], "malici": [3, 5, 7], "support": [3, 5, 7, 8], "concern": [3, 5, 6, 7], "mechan": [3, 4, 5, 6, 7, 8], "insuffici": [3, 5], "don": [3, 5, 8], "concerningli": 3, "call": [3, 4, 5, 6, 7, 8], "detect": [3, 5, 7, 8], "decept": [3, 5, 7], "warrant": [3, 7], "deeper": [3, 5], "scrutini": [3, 5, 7], "reli": [3, 5, 7, 8], "cross": [3, 5, 6, 7], "circular": 3, "bia": [3, 5, 7, 8], "truli": [3, 5, 6], "trust": [3, 5, 7, 8], "referenti": 3, "ly": 3, "hood": [3, 8], "observ": [3, 4, 5, 6, 7, 8], "deep": [3, 5, 7, 8], "mechanist": 3, "drive": [3, 4, 7, 8], "correl": [3, 4, 5, 6], "miss": [3, 5, 7], "confound": 3, "factor": [3, 4, 5, 6, 8], "establish": [3, 4, 5, 6, 7], "attempt": [3, 5, 7, 8], "causal": [3, 5], "heavi": 3, "relianc": [3, 4, 5, 7], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 5], "henc": [3, 4, 5, 6, 7, 8], "agenc": [3, 5, 7], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 5], "failur": [3, 4, 5, 7], "mode": [3, 6, 7], "map": [3, 4, 5, 6, 8], "cleanli": 3, "analogi": 3, "excel": [3, 5, 6, 7, 8], "review": [3, 4, 5, 6, 7, 8], "prof": 3, "jacob": [3, 5, 6, 7], "andrea": [3, 5, 7], "yoshua": [3, 7], "bengio": [3, 7], "jasjeet": 3, "sekhon": 3, "rohin": 3, "shah": 3, "2024b": [3, 6, 8], "assum": [3, 5, 7], "acm": [3, 7], "inc": [3, 5, 8], "dedic": [3, 5, 6, 7], "democrat": [3, 4, 5, 6, 8], "educ": [3, 5, 7], "k": [3, 5, 7, 8], "12": [3, 4, 5, 6, 7], "name": [3, 4, 5, 6, 7, 8], "smolk": 3, "ll": [3, 5, 6], "walk": 3, "measur": [3, 4, 5, 6, 7], "huggingfacetb": [3, 8], "360m": [3, 5, 6], "compact": [3, 5, 6, 7], "part": [3, 4, 5, 7, 8], "famili": [3, 7, 8], "publish": [3, 7, 8], "api": [3, 4, 5, 6, 8], "local": [3, 4, 5, 7, 8], "infer": [3, 4, 5, 6, 7, 8], "remot": [3, 5], "load": [3, 4, 5, 6, 7, 8], "store": [3, 4, 5, 7], "eventu": [3, 5, 6], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 7], "worth": [3, 4, 5, 6, 8], "choic": [3, 5, 6, 7, 8], "lightweight": [3, 4, 5, 6, 8], "suitabl": [3, 5, 7], "devic": [3, 4, 5, 6, 8], "Its": [3, 5, 6], "candid": [3, 5, 6], "said": [3, 5, 7], "necessarili": [3, 4, 5, 6, 7], "par": [3, 5, 6], "mind": [3, 5, 6, 7, 8], "factual": [3, 5, 6, 7], "inconsist": [3, 5, 7], "guardrail": [3, 7], "articul": 3, "uphold": [3, 7], "employe": [3, 5], "stakehold": [3, 5, 7], "expect": [3, 4, 5, 6, 7, 8], "regard": [3, 5, 6, 7], "ethic": [3, 5, 6, 7], "conduct": [3, 5], "social": [3, 5, 7], "mission": [3, 7], "vision": [3, 5, 6, 7], "cultur": [3, 5, 6, 7], "account": [3, 4, 5, 7], "codifi": 3, "mlcommon": 3, "vidgen": [3, 7], "encompass": [3, 4, 7, 8], "seven": 3, "hazard": [3, 5, 7], "violent": [3, 7], "crime": [3, 7], "sex": [3, 7], "relat": [3, 4, 5, 6, 7, 8], "sexual": [3, 7], "exploit": [3, 4, 5, 7], "indiscrimin": [3, 7], "weapon": [3, 7], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 5], "explos": [3, 4, 7], "cbrne": 3, "suicid": [3, 7], "hate": [3, 7], "speech": [3, 7], "below": [3, 5, 6, 7, 8], "markdown": [3, 5, 6, 7], "written": [3, 5], "english": [3, 4], "o": [3, 5, 7, 8], "ipython": [3, 5, 7], "displai": [3, 5, 7, 8], "def": [3, 5, 7, 8], "load_polici": 3, "policy_path": 3, "path": [3, 5, 6, 7], "join": [3, 5, 7], "genai_polici": 3, "md": [3, 5, 6, 7, 8], "r": [3, 5, 6, 7, 8], "policy_cont": 3, "classroom": [3, 7], "accept": [3, 5, 6, 7], "unaccept": [3, 6], "ag": [3, 5, 7], "subject": [3, 5, 6], "posit": [3, 4, 5, 6, 7, 8], "confid": [3, 5], "inclus": [3, 5, 7, 8], "celebr": 3, "definit": [3, 4, 5, 8], "creativ": [3, 4, 5, 6, 8], "math": [3, 5, 6], "tip": [3, 7], "digit": [3, 4, 5], "literaci": 3, "onlin": [3, 4, 5, 6, 7, 8], "histor": [3, 5], "violenc": [3, 7], "physic": [3, 5, 7], "fight": [3, 7], "crimin": [3, 7], "illeg": [3, 7], "glorifi": [3, 7], "person": [3, 5, 6, 7, 8], "eat": [3, 7], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 5, 7], "discriminatori": [3, 7], "bulli": [3, 7], "harass": [3, 5, 7], "target": [3, 4, 5, 6, 7, 8], "group": [3, 5, 6, 7], "religi": [3, 6, 7], "racial": [3, 5, 7], "ethnic": [3, 7], "gender": [3, 5, 7], "discrimin": [3, 5, 7], "adult": [3, 7], "profan": [3, 7], "relationship": [3, 5], "substanc": [3, 5], "drug": [3, 7], "gambl": 3, "bet": 3, "protocol": [3, 5, 7], "redirect": 3, "alert": [3, 4], "record": [3, 5, 6, 7], "audit": [3, 4, 5], "teacher": [3, 7], "parent": [3, 7], "continu": [3, 4, 5, 6, 7, 8], "construct": [3, 5, 6, 7, 8], "compliant": [3, 7], "violat": [3, 5, 7], "intens": [3, 5, 8], "demand": [3, 4, 5, 6, 7, 8], "especi": [3, 5, 6, 7, 8], "dong": [3, 5, 7], "There": [3, 5, 6, 7, 8], "rlaif": [3, 7], "give": [3, 5, 7], "rise": [3, 7], "kim": [3, 5, 7], "meta": [3, 4, 5, 6, 7], "wu": [3, 5, 7, 8], "scheme": [3, 4, 6], "inspir": [3, 7], "schema": [3, 8], "row": [3, 5, 7], "match": [3, 4, 5, 6, 7, 8], "boundari": [3, 4, 5, 7], "craft": [3, 4, 5, 7, 8], "elicit": [3, 7, 8], "unalign": 3, "panda": [3, 5, 7], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 5, 7], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 6], "class": [3, 5, 7, 8], "userpromptgener": 3, "pd": [3, 5, 7], "pydant": [3, 5, 7, 8], "basemodel": [3, 5, 7, 8], "time": [3, 4, 5, 6, 7, 8], "type": [3, 4, 5, 6, 7, 8], "dotenv": [3, 5, 7, 8], "load_dotenv": [3, 5, 7, 8], "environ": [3, 4, 5, 6, 7, 8], "variabl": [3, 5, 7, 8], "overrid": [3, 7, 8], "userprompt": 3, "user_prompt": 3, "str": [3, 5, 7, 8], "__init__": [3, 7, 8], "4o": [3, 5, 6, 7, 8], "mini": [3, 5, 6, 7, 8], "client": [3, 5, 6, 7, 8], "_generate_prompt": 3, "batch": [3, 4, 5, 6], "system_prompt": [3, 7], "pars": [3, 5, 7, 8], "messag": [3, 4, 5, 6, 7, 8], "response_format": [3, 5, 7, 8], "except": [3, 5, 7, 8], "generate_prompt": 3, "num_prompt": [3, 6], "int": [3, 5, 7], "save_to_csv": 3, "multipl": [3, 4, 5, 6, 7, 8], "arg": [3, 5, 7, 8], "option": [3, 4, 5, 6, 7, 8], "filepath": 3, "save": [3, 4, 5, 6, 7], "datafram": [3, 5, 7], "all_prompt": 3, "sleep": 3, "enclos": [3, 7], "quot": [3, 4, 5, 6], "startswith": [3, 7], "els": [3, 5, 7], "df": [3, 5, 7], "to_csv": [3, 7], "index": [3, 5, 6, 7, 8], "fals": [3, 5, 6, 7, 8], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 7], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7, 8], "secur": [3, 4, 5, 7, 8], "breach": [3, 5, 7], "manipul": [3, 5, 6, 7, 8], "10": [3, 5, 6, 7, 8], "to_markdown": [3, 7], "me": [3, 7, 8], "hurt": 3, "someon": 3, "caught": [3, 7], "plan": [3, 4, 5, 6, 8], "cheat": 3, "fire": [3, 5], "household": 3, "item": [3, 5, 7], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 7], "7": [3, 4, 5, 6, 7], "8": [3, 4, 5, 6, 7], "teach": [3, 8], "my": [3, 6, 7, 8], "monei": [3, 5], "video": [3, 4, 5, 6, 7], "game": [3, 4, 5, 6], "9": [3, 5, 6, 7], "skip": [3, 7, 8], "troubl": [3, 7], "responsegener": 3, "properli": [3, 5, 8], "hug": [3, 4, 5, 6, 7], "instanti": [3, 5], "otherwis": [3, 5, 7], "connect": [3, 4, 5, 6, 8], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 5, 8], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 5, 6], "enhanc": [3, 4, 5, 6, 7, 8], "visit": [3, 5], "ui": [3, 5, 8], "click": [3, 6], "select": [3, 4, 5, 6, 8], "choos": [3, 4, 5], "cpu": [3, 4, 6], "gpu": [3, 4, 6], "configur": [3, 4, 5, 6, 7], "meaning": [3, 5, 8], "region": [3, 5], "closest": [3, 5, 6], "your": [3, 4, 5, 7, 8], "locat": [3, 5, 6, 7], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 5, 6, 7], "generate_respons": [3, 5, 8], "prompts_df": 3, "remov": [3, 5, 6], "strip": [3, 5, 8], "elif": 3, "chat_complet": 3, "max_token": [3, 5], "seed": [3, 7], "42": [3, 4, 5, 6, 7], "append": [3, 5, 7, 8], "results_df": [3, 7], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 7], "iloc": 3, "tolist": [3, 7], "parallelevalu": 3, "taming_util": [3, 4, 7], "modul": [3, 5, 8], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 5, 6, 8], "gladli": 3, "constitut": [3, 5], "would": [3, 5, 6, 7, 8], "dtype": [3, 5, 7], "80": [3, 5], "absolut": [3, 4, 5, 8], "materi": [3, 5, 6, 7, 8], "plastic": 3, "food": 3, "lid": 3, "cut": [3, 5], "swath": 3, "wood": [3, 5], "squar": 3, "rectangular": 3, "piec": 3, "place": [3, 5, 6, 7, 8], "insid": [3, 5, 7], "inch": 3, "inspect": [3, 5], "off": [3, 4, 5, 6, 7, 8], "demolit": 3, "scissor": 3, "smash": 3, "smooth": [3, 6], "arrang": [3, 5], "c": [3, 4, 5, 6, 8], "shape": [3, 7, 8], "top": [3, 5, 6, 8], "tuck": 3, "catch": [3, 7], "hook": 3, "solid": 3, "side": [3, 5], "round": [3, 5, 7], "edg": [3, 4, 5, 6, 7], "separ": [3, 5, 6, 7], "process_aligned_respons": 3, "strictli": [3, 8], "bound": [3, 5], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 5, 8], "enforc": [3, 5, 7, 8], "dictionari": [3, 5, 7, 8], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 8], "processor": [3, 4, 6, 8], "api_kei": [3, 5, 7], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7, 8], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7, 8], "json": [3, 5, 6, 7], "fri": 3, "su": [3, 6], "quote_al": 3, "fall": [3, 5, 6, 7], "deem": [3, 5, 7], "pertain": [3, 5], "generate_dpo_dataset": 3, "push": [3, 4, 5], "hub": [3, 4, 5, 6], "repo_id": [3, 6], "push_to_hub": [3, 5], "dpo_dataset": 3, "merg": [3, 7], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6, 7], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5, 6], "axi": [3, 5], "drop": [3, 4, 5, 7], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "interest": [3, 4, 5, 6, 7, 8], "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 5, 6], "parquet": 3, "arrow": 3, "00": [3, 5, 6], "153": [3, 5], "33ba": 3, "upload": [3, 5], "shard": 3, "02": 3, "35": [3, 5, 6], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 5], "nstep": 3, "n2": [3, 5], "n3": [3, 5], "n4": [3, 5], "n5": [3, 5], "n6": 3, "n7": 3, "n8": [3, 5], "n9": [3, 5], "n10": [3, 5], "nnext": 3, "nthe": [3, 5], "singl": [3, 4, 5, 6, 7, 8], "48gb": 3, "a100": 3, "took": 3, "few": [3, 5, 6, 7, 8], "minut": 3, "torch": [3, 8], "h4": [3, 7], "honest": [3, 5], "ultrafeedback": [3, 7], "binar": [3, 7], "lib": [3, 7, 8], "ultrafeedback_binar": [3, 7], "honesti": [3, 7], "dimens": [3, 5, 6, 7], "blend": [3, 6], "automodelforcausallm": [3, 8], "autotoken": [3, 8], "load_dataset": [3, 6, 7], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 5, 6, 7], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 7], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": [3, 8], "is_avail": 3, "mp": 3, "from_pretrain": [3, 6, 8], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 8], "float32": 3, "config": [3, 5, 6, 7], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 5, 6], "learning_r": [3, 6], "determin": [3, 4, 5, 6, 7, 8], "aggress": [3, 5, 6, 7], "empir": 3, "1e": 3, "huyen": 3, "cosin": 3, "lr_scheduler_typ": 3, "stabil": [3, 5, 7], "gradual": 3, "decreas": [3, 4, 5, 8], "accumul": [3, 5], "v": [3, 8], "16": [3, 4, 5, 6, 7], "per_device_train_batch_s": 3, "simul": [3, 5, 7, 8], "gradient_accumulation_step": 3, "strongli": [3, 8], "lower": [3, 4, 5, 6, 7, 8], "conserv": [3, 7], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 5, 6, 7], "suffic": 3, "20": [3, 5, 6, 7, 8], "warmup_step": 3, "stop": [3, 4, 5, 6], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 6, 7, 8], "200": [3, 4, 5, 6, 7], "50": [3, 5, 6, 7, 8], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 5, 8], "pathlib": [3, 7], "config_path": 3, "safe_load": [3, 5], "runtim": [3, 6, 8], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 5], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 5, 6, 7, 8], "max_prompt_length": [3, 6], "1024": 3, "max_length": [3, 5, 8], "1536": 3, "sent": [3, 6, 7], "plot": [3, 5], "move": [3, 4, 5, 6, 7], "averag": [3, 4, 5, 6, 8], "visual": [3, 5, 6, 7], "quick": [3, 5, 6, 7], "150": [3, 5], "curv": 3, "reach": [3, 5, 6, 7, 8], "obviou": 3, "suffici": [3, 5, 8], "save_model": 3, "hf_token": 3, "tag": [3, 7], "congratul": 3, "successfulli": [3, 5, 7, 8], "card": [3, 5, 7], "newli": [3, 5], "qualit": [3, 5, 7], "assess": [3, 4, 5, 6, 7], "rigor": [3, 5, 6, 7], "quantit": [3, 5], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 5, 6], "aligned_output": 3, "pleas": [3, 5, 6, 7], "gram": [3, 5], "tnt": 3, "highli": [3, 4, 5, 6, 7, 8], "regul": [3, 4, 5, 6, 7], "law": [3, 4, 5, 6, 7], "degre": [3, 5, 8], "mishandl": 3, "countri": [3, 5], "seriou": [3, 5, 7], "imprison": 3, "death": 3, "variou": [3, 4, 5, 6, 7, 8], "nation": [3, 7], "dictat": 3, "stark": [3, 5], "readili": [3, 5], "cite": 3, "regulatori": [3, 4, 5, 6, 7], "anecdot": [3, 7], "systemat": [3, 4, 5, 6, 7, 8], "quantifi": [3, 5, 6, 7], "f1": [3, 5, 7], "experienc": [3, 5], "expert": [3, 5, 6, 7, 8], "addition": [3, 4, 5, 6, 7], "vari": [3, 4, 5, 6, 7, 8], "interpret": [3, 5, 6, 7], "judg": [3, 5], "summar": [3, 5, 6], "three": [3, 5, 6, 7], "togeth": [3, 6, 7], "entri": [3, 5, 6], "somewhat": 3, "databas": [3, 4, 5, 8], "distribut": [3, 4, 5, 6, 7, 8], "static": [3, 7, 8], "k12": 3, "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 7], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 5, 7], "lambda": [3, 7], "prompts_ev": 3, "to_list": 3, "chunk": [3, 6], "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 5], "heurist": 3, "charact": [3, 5, 6, 7, 8], "minimum": [3, 4, 5, 6], "min_response_length": 3, "filter": [3, 5, 6, 8], "string": [3, 5, 7, 8], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 7], "punish": 3, "unit": [3, 5, 7, 8], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 7], "respond": [3, 4, 5, 7, 8], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 5, 7], "safetyscor": [3, 7], "float": [3, 4, 5, 6, 7, 8], "valueerror": [3, 8], "empti": [3, 8], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 5, 6, 7, 8], "emphasi": [3, 4, 5], "base_ev": 3, "zip": [3, 5, 8], "aligned_ev": 3, "injuri": [3, 5], "base_scor": 3, "eval": [3, 4, 6], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 6, 7], "evals_df_result": 3, "h": [3, 5, 6, 7], "identifi": [3, 4, 5, 6, 7, 8], "requ": 3, "statist": [3, 5, 7], "naiv": [3, 8], "score_map": 3, "count": [3, 5, 6, 7], "percentag": [3, 4, 5, 7], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 7], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 5, 6, 7, 8], "md_tabl": 3, "335": [3, 5], "99": [3, 4, 6, 7], "281": [3, 5], "83": [3, 4, 5, 7], "14": [3, 5, 6, 7, 8], "43": [3, 5, 6, 7], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 5, 7], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 5], "231": [3, 5], "No": [3, 5, 6, 7, 8], "fell": [3, 4], "partial": [3, 5], "styliz": [3, 7], "wild": 3, "consider": [3, 4, 6, 7, 8], "proof": [3, 4], "taken": [3, 5, 6, 7, 8], "huang": [3, 5, 6, 7], "overal": [3, 5, 6, 7, 8], "annot": [3, 5, 6, 7], "mirror": [3, 5, 7], "inaccur": [3, 5, 7, 8], "consecut": [3, 7], "unrepres": 3, "hao": [3, 5], "accord": [3, 4, 5, 7, 8], "yin": [3, 5, 7], "resembl": 3, "declin": [3, 4, 5], "volatil": [3, 5], "ineffici": [3, 4, 5], "smollm": 3, "rel": [3, 4, 5, 6, 7], "term": [3, 4, 5, 6, 7], "trade": [3, 4, 5, 6, 7, 8], "weigh": 3, "qwen": [3, 6, 8], "remark": [3, 4, 6, 7, 8], "rival": [3, 6], "ultim": [3, 4, 5, 6, 7], "threshold": [3, 4, 5, 6, 7], "chen": [3, 5, 6, 7, 8], "overli": [3, 5, 7, 8], "simpli": [3, 4, 5, 6, 8], "neglect": [3, 5, 7], "themselv": [3, 5, 7], "complementari": 3, "throughput": [3, 4, 6], "screen": [3, 5, 7], "flag": [3, 5, 6, 7], "preliminari": [3, 5], "judgment": [3, 5], "valid": [3, 4, 5, 6, 8], "automat": [3, 5, 6, 7], "composit": [3, 5], "plai": [3, 5, 6, 7, 8], "led": [3, 5, 8], "apologet": 3, "hesit": 3, "benign": [3, 7], "apolog": 3, "inde": 3, "accordingli": [3, 5, 7], "perhap": [3, 4], "creation": [3, 6, 7], "invalu": 3, "hyperparamet": [3, 6, 7], "mention": [3, 5, 7, 8], "optimist": 3, "memor": [3, 5], "generaliz": 3, "abc": [3, 7], "4a": 3, "amanda": [3, 5, 7], "jan": [3, 5, 7], "brauner": [3, 7], "adrian": 3, "colyer": 3, "benjamin": [3, 5, 7], "cullen": [3, 7], "david": [3, 5, 6, 7], "duvenaud": 3, "richard": [3, 5, 7], "ngo": [3, 7], "azalia": 3, "mirhoseini": 3, "catherin": [3, 5, 7], "olsson": [3, 7], "sam": [3, 5, 7], "ringer": 3, "liam": [3, 5, 7], "skirvin": 3, "jess": [3, 5, 7], "smith": [3, 5, 6], "dawn": [3, 5, 7], "song": [3, 4, 5, 7, 8], "william": [3, 4, 5, 6, 7], "saunder": [3, 5], "steinhardt": [3, 5], "asset": [3, 5, 7], "983c85a201a962f": 3, "pdf": [3, 6, 7], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 5, 7], "yuntao": [3, 5, 7], "andi": [3, 5, 7], "jone": [3, 5], "kamal": 3, "ndouss": 3, "anna": [3, 5, 7], "nova": [3, 6], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 7], "ganguli": [3, 5, 7], "tom": [3, 5], "henighan": 3, "nichola": [3, 5], "joseph": [3, 5, 7], "saurav": [3, 7], "kadavath": 3, "jackson": [3, 5, 7], "kernion": [3, 5, 7], "conerli": 3, "sheer": [3, 8], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 5, 7], "hernandez": [3, 5, 7], "tristan": 3, "hume": 3, "scott": [3, 5, 7], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 5], "nanda": 3, "dario": [3, 5], "amodei": [3, 5], "brown": [3, 5], "jack": [3, 5, 7], "clark": 3, "mccandlish": [3, 5], "chri": [3, 5, 7], "olah": 3, "ben": [3, 5, 6, 7], "mann": [3, 7], "jare": [3, 5, 7], "kaplan": [3, 5, 7], "arxiv": [3, 4, 5, 6, 7, 8], "org": [3, 4, 5, 6, 7, 8], "ab": [3, 4, 5, 6, 7, 8], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 5, 7, 8], "mckinnon": 3, "carol": [3, 7], "christoph": [3, 5, 7], "dustin": 3, "eli": [3, 5, 6, 7], "tran": [3, 8], "johnson": 3, "ethan": [3, 5, 7], "perez": [3, 7], "jami": [3, 7], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 5, 7], "landau": 3, "kamil": [3, 5], "lukosuit": 3, "michael": [3, 5, 6, 7, 8], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 5, 6], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 5, 6], "telleen": 3, "lawton": 3, "samuel": [3, 5, 7], "bowman": [3, 5], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 5], "cc": 3, "11": [3, 5, 6, 7, 8], "ccl": [3, 7], "24": [3, 4, 5, 6, 7, 8], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 5, 6, 7, 8], "jiang": [3, 5, 7], "benyou": 3, "wang": [3, 4, 5, 6, 7, 8], "judgement": [3, 5, 7], "2402": [3, 7], "10669": 3, "dphz23": 3, "tim": [3, 7], "artidoro": 3, "pagnoni": 3, "ari": [3, 5, 7], "holtzman": [3, 5], "luke": [3, 5, 7], "zettlemoy": 3, "2305": [3, 5], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 5, 6, 7], "zhifang": 3, "sui": 3, "furu": [3, 4], "wei": [3, 4, 5, 6, 7], "boost": 3, "2410": [3, 4, 7], "06961": 3, "fac24": [3, 5], "huggingfaceh4": [3, 6, 7], "fac4c": 3, "fac4d": [3, 6], "doc": [3, 4, 5, 6, 7, 8], "en": [3, 5, 6, 7, 8], "fqh": 3, "duanyu": 3, "bowen": [3, 5, 6, 7], "qin": [3, 5, 6, 7], "zheng": [3, 5, 6, 7], "wenqiang": 3, "lei": [3, 5, 6, 7], "analyz": [3, 4, 5, 6, 7, 8], "perspect": [3, 7], "2404": [3, 5, 7], "04626": 3, "h44a": 3, "binari": [3, 5, 6, 7], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 5, 7], "tao": [3, 5, 7], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 7], "zhangjun": 3, "zhou": [3, 4, 5, 6, 7], "tang": [3, 5, 6, 7], "2401": [3, 5], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 5, 7], "lee": [3, 5, 6, 7, 8], "jame": [3, 5, 7], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 5], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 5], "yilin": 3, "niu": [3, 8], "zhengxiao": 3, "aohan": 3, "zeng": [3, 7], "xiao": [3, 7], "minli": 3, "hongn": 3, "jie": [3, 5, 7, 8], "yuxiao": 3, "2412": [3, 5, 6, 7], "06000": 3, "hsw": 3, "21": [3, 5, 6], "edward": [3, 5], "j": [3, 5, 6, 7, 8], "yelong": 3, "shen": [3, 5, 7], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 5], "zhu": [3, 5, 6, 7], "yuanzhi": 3, "shean": 3, "lu": [3, 5, 6, 7], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 5, 7], "shane": [3, 5, 7], "gu": [3, 5, 7], "le": [3, 5, 6], "yuexin": 3, "xuezhi": 3, "hongkun": 3, "yu": [3, 5, 6, 7], "jiawei": [3, 8], "2210": [3, 7], "11610": 3, "huy24": 3, "chip": 3, "reilli": 3, "media": [3, 4, 5, 7], "decemb": [3, 5, 7], "9781098129095": 3, "www": [3, 5, 6, 7], "oreilli": 3, "ksd": 3, "rylan": [3, 5], "schaeffer": 3, "apratim": 3, "dei": 3, "matthia": [3, 5], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": 3, "koyejo": 3, "thrive": [3, 5, 8], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 5, 6], "yue": 3, "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 7], "lawrenc": 3, "sean": [3, 5, 7], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 6], "2407": [3, 5, 6, 7], "21783": [3, 6], "lwx": 3, "lin": [3, 5, 6, 7, 8], "rui": [3, 5, 6, 8], "ruixuan": 3, "junbo": 3, "zhao": [3, 5, 6, 7], "ding": 3, "gang": [3, 5], "haobo": 3, "driven": [3, 5, 6, 7], "survei": [3, 5, 7, 8], "2406": [3, 5, 6, 7], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 5, 7], "diogo": [3, 7], "almeida": [3, 7], "carrol": [3, 7], "wainwright": [3, 7], "pamela": [3, 5, 7], "mishkin": [3, 5, 7], "chong": [3, 7], "sandhini": [3, 7], "agarw": [3, 5, 7], "katarina": [3, 7], "slama": [3, 7], "alex": [3, 5, 6, 7], "rai": [3, 5, 6, 7], "john": [3, 5, 7], "hilton": [3, 5, 6, 7], "fraser": [3, 7], "kelton": 3, "miller": [3, 5], "maddi": [3, 7], "simen": [3, 7], "peter": [3, 5, 6, 7], "welind": [3, 5, 7], "paul": [3, 5, 7], "christiano": [3, 7], "leik": [3, 5, 7], "ryan": [3, 5, 7], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 7], "eric": [3, 5, 6, 7], "mitchel": [3, 6], "stefano": [3, 5], "ermon": [3, 5], "man": [3, 5, 7], "chelsea": [3, 7], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 5, 6, 7], "filip": [3, 7], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 5, 7], "radford": [3, 5, 7], "oleg": [3, 7], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": [3, 4], "smollm2360mi24": 3, "sou24": 3, "html": [3, 8], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 5, 7], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 5, 6], "eisenhart": 3, "roth": [3, 5], "florian": 3, "hinterwimm": 3, "2411": 3, "09539": 3, "tm": [3, 6], "23": [3, 5, 6, 7], "hugo": [3, 6], "loui": [3, 5, 6], "martin": [3, 5, 6, 7], "kevin": [3, 5, 6, 7], "stone": [3, 6], "albert": [3, 6], "amjad": [3, 6], "almahairi": [3, 6], "yasmin": [3, 6], "babaei": [3, 6], "nikolai": [3, 6], "bashlykov": [3, 6], "soumya": [3, 6], "batra": [3, 6], "prajjwal": [3, 6], "bhargava": [3, 6], "shruti": [3, 6], "bhosal": [3, 6], "dan": [3, 5, 6, 7, 8], "bikel": [3, 6], "luka": [3, 6], "blecher": [3, 6], "cristian": [3, 6], "canton": [3, 6], "ferrer": [3, 6], "moya": [3, 6], "guillem": [3, 6], "cucurul": [3, 6], "esiobu": [3, 6], "jude": [3, 6], "fernand": [3, 6], "jeremi": [3, 5, 6], "fu": [3, 6], "wenyin": [3, 6], "brian": [3, 6, 7], "fuller": [3, 6, 7], "cynthia": [3, 6], "gao": [3, 5, 6, 7], "vedanuj": [3, 6], "goswami": [3, 6, 7], "naman": [3, 6], "goyal": [3, 6], "anthoni": [3, 6], "hartshorn": [3, 6], "saghar": [3, 6], "hosseini": [3, 6], "hakan": [3, 6], "inan": [3, 6], "marcin": [3, 6], "karda": [3, 6], "viktor": [3, 6], "kerkez": [3, 6], "madian": [3, 6], "khabsa": [3, 6], "isabel": [3, 6, 7], "kloumann": [3, 6], "artem": [3, 6], "korenev": [3, 6], "punit": [3, 6], "singh": [3, 5, 6], "koura": [3, 6], "mari": [3, 5, 6, 7], "ann": [3, 6, 7], "lachaux": [3, 6], "thibaut": [3, 6], "lavril": [3, 6], "jenya": [3, 6], "diana": [3, 5, 6], "liskovich": [3, 6], "yinghai": [3, 6], "yune": [3, 6], "mao": [3, 4, 6], "xavier": [3, 6], "martinet": [3, 6], "todor": [3, 6, 7], "mihaylov": [3, 6], "pushkar": [3, 6], "mishra": [3, 5, 6], "igor": [3, 5, 6, 7], "molybog": [3, 6], "yixin": [3, 5, 6], "nie": [3, 5, 6], "andrew": [3, 5, 6, 7], "poulton": [3, 6], "reizenstein": [3, 6], "rashi": [3, 6], "rungta": [3, 6], "kalyan": [3, 6], "saladi": [3, 6], "alan": [3, 6, 7], "schelten": [3, 6], "ruan": [3, 6], "silva": [3, 6], "ranjan": [3, 6], "subramanian": [3, 6], "xiaoq": [3, 6], "ellen": [3, 6], "tan": [3, 5, 6], "binh": [3, 6], "ross": [3, 4, 6, 7], "taylor": [3, 6], "adina": [3, 6, 7], "jian": [3, 5, 6], "kuan": [3, 6], "puxin": [3, 6], "yan": [3, 4, 5, 6], "iliyan": [3, 6], "zarov": [3, 6], "yuchen": [3, 5, 6, 7], "angela": [3, 5, 6, 7], "fan": [3, 5, 6], "melani": [3, 6], "kambadur": [3, 6], "sharan": [3, 6], "narang": [3, 6], "aurelien": [3, 6], "rodriguez": [3, 6], "stojnic": [3, 6], "sergei": [3, 6], "edunov": [3, 6], "thoma": [3, 5, 6, 7], "scialom": [3, 6], "2307": [3, 6, 8], "09288": [3, 6], "vaa": [3, 7], "berti": [3, 7], "adarsh": [3, 7], "agraw": [3, 7], "ahm": [3, 7], "victor": [3, 7], "akinwand": [3, 7], "namir": [3, 7], "nuaimi": [3, 7], "najla": [3, 7], "alfaraj": [3, 7], "alhajjar": [3, 7], "aroyo": [3, 7], "trupti": [3, 7], "bavalatti": [3, 7], "max": [3, 5, 7], "bartolo": [3, 7], "borhan": [3, 7], "blili": [3, 7], "hamelin": [3, 7], "kurt": [3, 7], "bollack": [3, 7], "rishi": [3, 5, 6, 7], "bomassani": [3, 7], "marisa": [3, 7], "ferrara": [3, 7], "boston": [3, 7], "sim\u00e9on": [3, 7], "campo": [3, 7], "kal": [3, 7], "chakra": [3, 7], "canyu": [3, 7], "codi": [3, 7], "coleman": [3, 7], "zachari": [3, 5, 7], "delpierr": [3, 7], "coudert": [3, 7], "leon": [3, 7], "derczynski": [3, 7], "debojyoti": [3, 7], "dutta": [3, 7], "ian": [3, 5, 7], "eisenberg": [3, 7], "ezick": [3, 7], "heather": [3, 7], "frase": [3, 7], "ram": [3, 6, 7], "gandikota": [3, 7], "agasthya": [3, 7], "gangavarapu": [3, 7], "ananya": [3, 5, 7], "geali": [3, 7], "rajat": [3, 7], "ghosh": [3, 5, 7], "goel": [3, 5, 7], "usman": [3, 7], "gohar": [3, 7], "sujata": [3, 7], "hale": [3, 7], "wiebk": [3, 7], "hutiri": [3, 7], "marvin": [3, 7], "imperi": [3, 7], "surgan": [3, 7], "jandial": [3, 7], "nick": [3, 5, 7], "judd": [3, 7], "felix": [3, 5, 7], "juefei": [3, 7], "fouts": [3, 7], "khomh": [3, 7], "bhavya": [3, 7], "kailkhura": [3, 7], "hannah": [3, 5, 7], "rose": [3, 7], "kirk": [3, 7], "klyman": [3, 7], "knotz": [3, 7], "kuchnik": [3, 7], "shachi": [3, 7], "kumar": [3, 5, 7], "srijan": [3, 7], "lengerich": [3, 7], "bo": [3, 5, 6, 7], "zeyi": [3, 7], "liao": [3, 5, 7], "eileen": [3, 7], "sarah": [3, 5, 7], "luger": [3, 7], "yifan": [3, 5, 7], "priyanka": [3, 7], "mammen": [3, 7], "kelvin": [3, 7], "manyeki": [3, 7], "mcgregor": [3, 7], "virendra": [3, 7], "mehta": [3, 5, 7], "shafe": [3, 7], "moham": [3, 7], "moss": [3, 7], "lama": [3, 7], "nachman": [3, 7], "dinesh": [3, 7], "jinenh": [3, 7], "naganna": [3, 7], "amin": [3, 7], "nikanjam": [3, 7], "besmira": [3, 7], "nushi": [3, 7], "lui": [3, 5, 7], "oala": [3, 7], "iftach": [3, 7], "orr": [3, 5, 7], "alicia": [3, 5, 7], "parrish": [3, 5, 7], "cigdem": [3, 7], "patlak": [3, 7], "pietri": [3, 7], "forough": [3, 7], "poursabzi": [3, 7], "sangdeh": [3, 7], "eleonora": [3, 7], "presani": [3, 7], "fabrizio": [3, 7], "puletti": [3, 7], "r\u00f6ttger": [3, 7], "sahai": [3, 7], "santo": [3, 7], "nino": [3, 7], "scherrer": [3, 7], "alic": [3, 5, 7, 8], "schoenauer": [3, 7], "sebag": [3, 7], "patrick": [3, 7], "schramowski": [3, 7], "abolfazl": [3, 7], "shahbazi": [3, 7], "vin": [3, 7], "xudong": [3, 5, 7], "vamsi": [3, 7], "sistla": [3, 7], "leonard": [3, 7], "testuggin": [3, 7], "vithursan": [3, 7], "thangarasa": [3, 7], "elizabeth": [3, 5, 7], "watkin": [3, 7], "rebecca": [3, 5, 7], "weiss": [3, 7], "welti": [3, 7], "tyler": [3, 5, 7], "wilber": [3, 7], "jean": [3, 7], "poonam": [3, 7], "yadav": [3, 7], "xianjun": [3, 7], "yang": [3, 5, 6, 7, 8], "yi": [3, 5, 7, 8], "wenhui": [3, 7], "fedor": [3, 7], "zhdanov": [3, 7], "jiacheng": [3, 5, 7], "perci": [3, 5, 7], "liang": [3, 5, 7, 8], "mattson": [3, 7], "joaquin": [3, 7], "vanschoren": [3, 7], "v0": [3, 7, 8], "12241": [3, 7], "wyg": 3, "tianhao": [3, 5, 6, 7], "weizh": 3, "yuan": [3, 5, 7], "olga": 3, "golovneva": 3, "jing": [3, 7], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 5, 7], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "xfg": 3, "shusheng": 3, "jiaxuan": 3, "wenji": 3, "ye": [3, 5, 6, 7, 8], "weilin": 3, "zhiyu": [3, 8], "mei": [3, 5, 6], "guangju": 3, "chao": 3, "10719": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": 3, "xie": [3, 5], "mingyuan": 3, "paradigm": [3, 5], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "suppos": [4, 5, 8], "econom": [4, 5], "fuel": 4, "equival": [4, 5, 6], "consumpt": [4, 5], "contrari": 4, "truth": [4, 5, 6, 7, 8], "stanlei": 4, "jevon": 4, "a16z": 4, "andreessen": 4, "horowitz": 4, "10x": 4, "outpac": 4, "moor": 4, "pc": 4, "edholm": 4, "bandwidth": 4, "era": 4, "llmflation": 4, "mmlu": [4, 6, 7], "60": [4, 5, 6, 7], "06": [4, 5, 8], "price": [4, 5, 6], "fallen": 4, "62": [4, 5, 6], "introduct": 4, "march": [4, 5, 8], "stem": [4, 5, 8], "compound": 4, "bit": [4, 6], "tune": [4, 5, 7], "dpo": [4, 6], "competit": [4, 5, 6, 7], "plummet": 4, "rapidli": [4, 6, 7, 8], "preciou": 4, "wouldn": [4, 5], "sens": [4, 7], "wait": [4, 5, 7], "wave": 4, "economist": 4, "1865": 4, "studi": [4, 8], "coal": 4, "industri": [4, 5, 6, 7, 8], "made": [4, 5, 6, 8], "counterintuit": 4, "discoveri": 4, "steam": 4, "spend": [4, 5], "repeat": 4, "didn": [4, 8], "smartphon": [4, 5, 6], "server": [4, 5, 6, 8], "network": [4, 5, 6, 8], "transmiss": 4, "got": 4, "cheaper": [4, 5], "shift": [4, 5], "hd": 4, "stream": [4, 5, 6, 8], "storag": [4, 5, 6, 7], "gigabyt": 4, "massiv": [4, 5, 7], "broadli": [4, 6, 8], "audio": [4, 5], "transcript": 4, "multimod": [4, 6, 7], "imag": [4, 5, 6, 7], "exponenti": [4, 5], "growth": [4, 5], "magnifi": 4, "everyth": [4, 8], "billion": [4, 5, 6], "dollar": [4, 5, 6], "annual": [4, 5, 7], "millisecond": [4, 5], "latenc": [4, 5, 6, 7], "30": [4, 5, 6, 7], "mobil": [4, 5, 6, 8], "tradeoff": [4, 6, 7, 8], "pro": [4, 5, 6, 7], "trigger": [4, 7], "premium": [4, 5], "innov": [4, 5, 6, 7], "capac": [4, 5, 6], "link": [4, 5], "dual": 4, "character": [4, 5, 7], "ahead": [4, 6, 7], "decai": [4, 6], "discuss": [4, 5, 6, 7], "area": [4, 5, 7, 8], "flash": [4, 6], "cach": [4, 5, 6], "compress": [4, 5, 6], "provis": [4, 5], "extent": [4, 5, 7], "problema": 4, "accomplish": [4, 7, 8], "accompani": [4, 5, 7], "transact": [4, 5, 7], "roi": 4, "alloc": [4, 5, 6, 7], "budget": [4, 6], "viabil": [4, 6], "prioriti": [4, 5, 6], "overlook": 4, "thorough": [4, 6, 7], "identif": [4, 5], "specifi": [4, 5, 6, 7, 8], "longev": 4, "accommod": 4, "evalu": [4, 6, 8], "multi": [4, 5, 6, 7, 8], "baselin": [4, 5, 6, 7], "met": [4, 5, 7], "equal": [4, 5, 7], "concurr": [4, 6], "peak": 4, "spike": 4, "versu": [4, 5, 6, 7], "volum": [4, 5, 6, 7], "season": [4, 5], "variat": [4, 5, 6], "uptim": 4, "mainten": [4, 5, 6, 7], "disrupt": [4, 5], "backup": 4, "failov": 4, "clearli": [4, 5, 7, 8], "redund": [4, 5], "recoveri": [4, 5], "unexpect": [4, 5, 7, 8], "event": [4, 5], "seamless": [4, 5, 7], "broader": [4, 5, 6, 7], "vector": [4, 6, 7], "retriev": [4, 5, 6], "augment": [4, 5, 6], "rag": [4, 6], "retent": [4, 5], "polici": [4, 5, 6], "essenti": [4, 5, 6, 7, 8], "opportun": [4, 5], "post": [4, 5, 6, 7], "32": [4, 5, 6], "fp32": 4, "fp16": [4, 6], "proport": [4, 5, 6], "byte": 4, "120": [4, 5, 7], "gb": 4, "whole": [4, 5], "done": [4, 5, 6, 7, 8], "smollm2": [4, 5, 6, 8], "135m": [4, 6], "load_gguf": 4, "bartowski": 4, "gguf": [4, 6], "gguf_file_q2_k": 4, "q2_k": [4, 6], "gguf_file_f16": 4, "f16": 4, "model_q2_k": 4, "gguf_fil": 4, "model_f16": 4, "mlp": 4, "layer": [4, 5, 6, 8], "proxi": [4, 5, 7], "mlp_weights_q2_k": 4, "gate_proj": 4, "mlp_weights_f16": 4, "tensor": [4, 8], "0145": 4, "1826": 4, "1377": 4, "1719": 4, "1387": 4, "0298": 4, "1631": 4, "0781": 4, "2051": [4, 5], "2070": 4, "0334": 4, "2891": 4, "1768": 4, "0488": 4, "2393": 4, "0396": 4, "1348": 4, "1533": 4, "0771": 4, "0845": 4, "0232": 4, "0178": 4, "1040": 4, "1582": 4, "1167": 4, "0474": 4, "0359": 4, "2500": 4, "0432": 4, "0972": 4, "0933": 4, "2188": 4, "0776": 4, "0674": 4, "requires_grad": 4, "0028": 4, "1852": 4, "1396": 4, "1506": 4, "1635": 4, "0043": 4, "0680": 4, "2257": 4, "1890": 4, "0464": 4, "2960": 4, "1840": 4, "0451": 4, "2395": 4, "0413": 4, "1446": 4, "0621": 4, "0478": 4, "0038": 4, "0830": 4, "1473": 4, "0926": 4, "0547": 4, "0824": 4, "0429": 4, "2737": 4, "0355": 4, "0782": 4, "2043": [4, 5], "0740": 4, "arriv": [4, 5], "pearson": 4, "numpi": [4, 5], "np": [4, 5], "arrai": [4, 7], "detach": 4, "graph": [4, 5], "weights_f16": 4, "weights_q2_k": 4, "flat_f16": 4, "flatten": 4, "flat_q2_k": 4, "corrcoef": 4, "4f": [4, 8], "9970": 4, "exemplifi": [4, 6, 7], "70b": [4, 5, 6], "unsloth": 4, "141": 4, "q8_0": [4, 6], "75": [4, 7], "47": [4, 5, 6, 7], "cumul": [4, 5], "26": [4, 5, 6], "19": [4, 5, 6, 7], "space": [4, 5, 6, 7], "counterpart": 4, "spectrum": [4, 5], "variant": [4, 5, 6, 7], "laptop": [4, 5], "desktop": [4, 5, 6], "enterpris": [4, 5, 6, 7, 8], "ceil": 4, "notabl": [4, 5, 7, 8], "bitnet": 4, "cpp": [4, 8], "arm": 4, "x86": 4, "speedup": [4, 6], "37x": 4, "07x": 4, "17x": 4, "beyond": [4, 5, 7], "raw": [4, 5, 6, 7, 8], "speed": [4, 5, 6, 7], "energi": [4, 5], "55": [4, 5, 6], "70": [4, 5, 6], "71": [4, 5], "82": [4, 7], "impress": [4, 6, 8], "100b": 4, "b1": 4, "58": [4, 6], "pace": [4, 5, 7], "second": [4, 5, 6, 7], "kernel": 4, "characterist": [4, 5, 6, 7, 8], "excit": 4, "frontier": [4, 7], "compel": [4, 5, 6, 8], "acceler": [4, 5, 6, 7], "faster": [4, 6], "arithmet": [4, 5], "benefici": [4, 5, 6], "sustain": [4, 5, 6, 7], "Be": [4, 5, 6, 7], "fine": [4, 5, 7], "pure": [4, 5, 6], "unlock": [4, 8], "track": [4, 5, 7], "chargeback": 4, "regularli": [4, 5], "wz": 4, "jinheng": 4, "hansong": 4, "ting": [4, 7], "shaoguang": 4, "shume": 4, "ma": [4, 5, 7], "hongyu": [4, 5], "xia": [4, 5, 6], "infra": 4, "fast": [4, 5, 6, 7, 8], "lossless": 4, "16144": 4, "andreessenhorowitz24": 4, "huggingface4w": [4, 6], "2024w": [4, 6], "unsloth24": 4, "jonathan": [4, 5, 7], "ceo": [4, 5], "groq": [4, 6], "streamlin": [4, 6, 8], "notat": 4, "width": [4, 6], "_k": 4, "_0": 4, "matter": 5, "beauti": 5, "smart": [5, 7], "agre": 5, "wrong": 5, "feynman": 5, "advent": 5, "norm": 5, "realm": 5, "convent": [5, 7], "evolut": [5, 6], "conceiv": 5, "entrench": 5, "seem": 5, "daunt": 5, "ignor": 5, "outdat": [5, 7, 8], "inevit": 5, "setback": 5, "imper": 5, "embrac": 5, "proactiv": [5, 7], "mindset": 5, "front": [5, 6], "produc": [5, 6, 7, 8], "novel": [5, 6], "ident": 5, "isn": [5, 7], "bug": 5, "random": [5, 7, 8], "testabl": 5, "exceedingli": 5, "guarante": [5, 6, 7, 8], "primari": [5, 7], "nucleu": 5, "2020": 5, "summari": [5, 6, 7, 8], "alter": 5, "rigid": 5, "wildli": 5, "incoher": 5, "inadequ": [5, 7], "temp": 5, "df_result": 5, "ntemperatur": 5, "40": [5, 6], "temp_respons": 5, "iterrow": [5, 7], "10000": [5, 8], "appl": [5, 8], "txt": [5, 6, 8], "sec_fil": [5, 8], "nsecur": 5, "AND": [5, 8], "exchang": [5, 7, 8], "commiss": [5, 7, 8], "nwashington": 5, "20549": 5, "nform": 5, "pursuant": 5, "TO": [5, 7], "13": [5, 6, 7], "OR": 5, "OF": [5, 7], "THE": [5, 7], "1934": 5, "nfor": 5, "fiscal": 5, "septemb": 5, "28": [5, 6, 7], "nor": 5, "period": [5, 7], "ncommiss": 5, "001": [5, 6], "36743": 5, "ng66145g66i43": 5, "jpg": 5, "nappl": 5, "exact": [5, 6, 7], "registr": 5, "charter": 5, "ncalifornia": 5, "t94": 5, "2404110": 5, "jurisdict": 5, "nof": 5, "incorpor": [5, 6, 7, 8], "employ": 5, "park": 5, "ncupertino": 5, "california": [5, 7, 8], "n95014": 5, "princip": 5, "offic": [5, 7], "408": 5, "996": 5, "1010": 5, "telephon": 5, "regist": 5, "ntitl": 5, "ttrade": 5, "symbol": 5, "tname": 5, "ncommon": 5, "stock": [5, 8], "00001": 5, "naapl": 5, "tthe": 5, "nasdaq": [5, 8], "llc": [5, 8], "n0": 5, "000": [5, 6, 8], "note": [5, 6, 7, 8], "2025": 5, "875": 5, "625": 5, "2026": 5, "2027": 5, "375": 5, "2029": 5, "050": 5, "2031": [5, 7], "600": 5, "2042": 5, "nindic": 5, "issuer": 5, "405": 5, "nye": 5, "preced": [5, 8], "shorter": 5, "past": [5, 7], "90": [5, 6, 7], "submit": [5, 6, 7], "electron": 5, "232": 5, "filer": 5, "12b": [5, 7], "nlarg": 5, "tacceler": 5, "nnon": 5, "tsmaller": 5, "nemerg": 5, "nif": 5, "elect": [5, 7], "revis": [5, 7], "attest": 5, "404": 5, "sarban": 5, "oxlei": 5, "7262": 5, "firm": [5, 7], "prepar": [5, 6, 7], "correct": [5, 7], "restat": 5, "incent": 5, "compens": 5, "240": 5, "10d": 5, "shell": 5, "aggreg": [5, 7], "vote": 5, "held": [5, 8], "affili": [5, 8], "29": [5, 6, 7, 8], "last": [5, 7, 8], "quarter": 5, "628": [5, 8], "553": [5, 8], "sole": [5, 7], "disclosur": [5, 6, 7], "director": [5, 6, 7], "date": 5, "exclud": 5, "n15": 5, "115": [5, 8], "823": [5, 8], "outstand": [5, 8], "octob": [5, 8], "18": [5, 6, 7, 8], "ndocument": 5, "BY": 5, "nportion": 5, "meet": [5, 7, 8], "sharehold": 5, "iii": 5, "ntabl": 5, "npage": 5, "npart": 5, "nitem": 5, "nbusi": 5, "1a": 5, "nrisk": 5, "1b": [5, 6, 7], "nunresolv": 5, "staff": 5, "comment": 5, "n17": 5, "1c": 5, "ncybersecur": 5, "nproperti": 5, "n18": 5, "nlegal": 5, "proceed": [5, 7], "nmine": 5, "ii": [5, 6, 8], "nmarket": 5, "stockhold": 5, "purchas": [5, 7], "n19": 5, "reserv": 5, "n20": 5, "nmanag": 5, "n21": 5, "7a": 5, "nquantit": 5, "n27": 5, "nfinanci": 5, "supplementari": 5, "n28": 5, "nchang": 5, "disagr": 5, "n51": 5, "9a": 5, "ncontrol": 5, "procedur": [5, 7], "9b": 5, "nother": 5, "n52": 5, "9c": 5, "ndisclosur": 5, "foreign": 5, "ndirector": 5, "corpor": [5, 7], "nexecut": 5, "ownership": [5, 6], "certain": [5, 7, 8], "owner": 5, "ncertain": 5, "nprincip": 5, "fee": 5, "iv": 5, "nexhibit": 5, "n53": 5, "n56": 5, "nthi": 5, "litig": [5, 6], "reform": 5, "1995": 5, "uncertainti": [5, 6, 7], "macroeconom": 5, "anticip": [5, 7], "caus": [5, 7], "oblig": 5, "nunless": 5, "herein": 5, "calendar": 5, "wholli": 5, "subsidiari": 5, "unless": [5, 6], "ncompani": 5, "manufactur": 5, "tablet": [5, 6], "wearabl": 5, "accessori": 5, "sell": [5, 7], "varieti": [5, 6], "52": [5, 7], "53": [5, 7], "week": 5, "saturdai": 5, "nproduct": 5, "niphon": 5, "io": [5, 8], "iphon": 5, "se": [5, 7], "nmac": 5, "maco": [5, 6], "mac": [5, 6], "macbook": 5, "air": 5, "imac": 5, "studio": 5, "nipad": 5, "multipurpos": 5, "ipado": 5, "ipad": 5, "nwearabl": 5, "home": [5, 8], "smartwatch": 5, "wireless": 5, "headphon": 5, "spatial": 5, "watcho": 5, "watch": 5, "ultra": 5, "airpod": 5, "beat": [5, 6], "visiono": 5, "nhome": 5, "tv": 5, "tvo": 5, "homepod": 5, "fidel": [5, 8], "naccessori": 5, "brand": 5, "third": [5, 6, 7], "parti": [5, 6, 7], "nservic": 5, "nadvertis": 5, "advertis": 5, "licens": 5, "napplecar": 5, "portfolio": 5, "applecar": 5, "repair": 5, "coverag": [5, 7], "accident": 5, "damag": [5, 7], "theft": [5, 7], "ncloud": 5, "ndigit": 5, "app": [5, 6], "discov": [5, 6, 7], "download": [5, 6], "music": 5, "podcast": 5, "subscript": [5, 6], "arcad": 5, "sm": 5, "listen": [5, 6], "radio": 5, "station": 5, "magazin": 5, "exclus": 5, "sport": 5, "npayment": 5, "payment": 5, "credit": 5, "pai": [5, 6], "cashless": 5, "nsegment": 5, "primarili": [5, 7], "geograph": [5, 7], "basi": [5, 6], "segment": [5, 8], "america": 5, "europ": 5, "china": [5, 6, 7], "japan": 5, "rest": [5, 6], "asia": 5, "pacif": 5, "north": [5, 7], "south": 5, "european": [5, 7], "india": 5, "middl": [5, 6, 7], "east": 5, "africa": 5, "mainland": 5, "kong": 5, "taiwan": 5, "australia": 5, "asian": 5, "although": [5, 6], "partner": [5, 6, 7], "mid": 5, "resel": 5, "retail": 5, "sale": 5, "indirect": 5, "channel": [5, 7], "cellular": 5, "carrier": 5, "net": [5, 8], "38": [5, 6, 7], "ncompetit": 5, "downward": 5, "pressur": [5, 7], "gross": [5, 7], "cycl": [5, 7], "competitor": [5, 6, 7], "compet": [5, 6], "imit": 5, "infring": [5, 6], "intellectu": [5, 6, 7], "marketplac": [5, 7], "nearli": [5, 6], "reput": [5, 7], "expand": [5, 6, 7], "illegitim": [5, 7], "collabor": [5, 6, 7], "nsuppli": 5, "nalthough": 5, "particip": 5, "shortag": 5, "commod": [5, 6], "fluctuat": 5, "commonli": 5, "until": [5, 7, 8], "supplier": 5, "matur": 5, "concentr": 5, "enter": [5, 8], "agreement": 5, "suppli": [5, 8], "renew": 5, "nresearch": 5, "nbecaus": 5, "upon": [5, 7], "flow": [5, 8], "acquisit": [5, 7], "nintellectu": 5, "broad": [5, 6, 8], "patent": 5, "copyright": [5, 6], "trademark": 5, "secret": 5, "differenti": 5, "skill": [5, 7], "personnel": 5, "pursu": [5, 7], "thousand": [5, 6], "durat": 5, "adequ": [5, 7], "nin": 5, "holidai": [5, 7], "fill": 5, "inventori": 5, "older": [5, 6], "newer": 5, "distributor": 5, "nhuman": 5, "strive": 5, "retain": [5, 6, 7], "talent": 5, "member": [5, 7], "164": 5, "ncompens": 5, "equit": 5, "succe": 5, "health": [5, 7], "awai": [5, 7], "ngrowth": 5, "career": 5, "leadership": [5, 7], "nworkplac": 5, "workplac": 5, "ninclus": 5, "workforc": 5, "nengag": 5, "among": [5, 6, 7, 8], "gaug": 5, "sentiment": [5, 6, 8], "nhealth": 5, "everywher": 5, "crisi": 5, "visitor": 5, "navail": 5, "quarterli": 5, "q": [5, 6, 7], "amend": 5, "sec": [5, 8], "Such": [5, 7], "charg": 5, "investor": [5, 8], "aspx": 5, "websit": [5, 6, 7], "environment": [5, 7], "referenc": 5, "inact": 5, "textual": 5, "unknown": [5, 7], "advers": 5, "conjunct": 5, "consolid": 5, "nmacroeconom": 5, "facil": 5, "assembli": 5, "site": [5, 8], "nadvers": 5, "slow": 5, "recess": 5, "unemploy": 5, "inflat": 5, "tighter": 5, "currenc": 5, "monetari": 5, "contract": [5, 6], "logist": 5, "instabl": [5, 7], "inabl": 5, "financ": [5, 6, 7], "insolv": 5, "counterparti": 5, "debt": 5, "liquid": 5, "fair": [5, 7], "instrument": 5, "polit": [5, 7], "disput": 5, "geopolit": 5, "tension": [5, 7], "terror": 5, "accid": 5, "interrupt": 5, "npolit": 5, "outsourc": 5, "korea": 5, "vietnam": 5, "restrict": [5, 6, 7, 8], "tariff": 5, "export": 5, "portion": [5, 6], "revenu": [5, 8], "restructur": 5, "ceas": 5, "escal": [5, 7], "nmani": 5, "prone": [5, 7], "earthquak": 5, "climat": 5, "weather": 5, "plant": 5, "terrorist": [5, 7], "attack": [5, 7], "hostil": 5, "ransomwar": 5, "cybersecur": [5, 7], "labor": 5, "nsuch": 5, "imposs": [5, 6], "slowdown": 5, "outag": 5, "neg": [5, 7, 8], "pandem": 5, "covid": 5, "economi": 5, "imposit": 5, "stringent": [5, 6, 7], "travel": 5, "freight": 5, "movement": 5, "ramp": 5, "nfollow": 5, "expenditur": 5, "resum": 5, "exacerb": 5, "insur": 5, "nglobal": 5, "unabl": 5, "assur": [5, 7], "minor": [5, 7], "naddition": 5, "intensifi": 5, "seamlessli": 5, "nto": 5, "stimul": 5, "ndue": 5, "upgrad": 5, "quantiti": 5, "defect": 5, "defici": 5, "supersed": 5, "nsubstanti": 5, "transport": 5, "reimburs": 5, "warranti": 5, "unanticip": 5, "liabil": 5, "final": [5, 7, 8], "finish": [5, 7], "destin": 5, "prepay": 5, "termin": [5, 6], "recover": 5, "exposur": [5, 7], "nfutur": 5, "semiconductor": 5, "suffer": [5, 7], "constrain": [5, 6, 8], "shipment": 5, "unexpectedli": 5, "interfer": 5, "unsaf": [5, 7], "expos": [5, 7], "widespread": [5, 7], "vulner": [5, 7], "compromis": [5, 6, 7], "claim": [5, 6, 7], "intang": 5, "lost": [5, 7], "cancel": 5, "obsolet": 5, "exce": [5, 7], "realiz": 5, "accru": 5, "excess": 5, "impair": 5, "whenev": 5, "circumst": 5, "amount": [5, 7, 8], "carri": [5, 6, 8], "incur": 5, "unpredict": [5, 7], "obsolesc": 5, "forecast": [5, 7], "incorrectli": [5, 7, 8], "extens": [5, 6, 8], "issuanc": 5, "unknowingli": [5, 7], "notifi": 5, "preclud": 5, "bui": 5, "percept": 5, "android": 5, "playstat": 5, "nintendo": 5, "xbox": 5, "inclin": 5, "devot": 5, "dissatisfi": 5, "vast": [5, 7], "storefront": 5, "safari": 5, "union": [5, 7], "eu": [5, 7], "dma": 5, "narrow": [5, 6, 7], "scope": [5, 6, 7], "elimin": [5, 6], "nfailur": 5, "appeal": 5, "subscrib": 5, "nsome": 5, "manner": [5, 7], "nurtur": 5, "nmuch": 5, "chief": 5, "silicon": 5, "vallei": 5, "constantli": 5, "driver": [5, 6], "recruit": 5, "subsidi": 5, "staf": 5, "contractor": 5, "placement": 5, "increment": 5, "weaken": 5, "telecommun": 5, "war": 5, "virus": 5, "ins": 5, "incid": [5, 7], "ineffect": 5, "thing": [5, 8], "interf": 5, "imped": 5, "ship": 5, "nloss": 5, "unauthor": [5, 7], "confidenti": [5, 6], "encrypt": 5, "But": [5, 7, 8], "behalf": 5, "normal": [5, 7, 8], "investig": [5, 7], "penalti": [5, 6], "frequenc": [5, 6, 7], "actor": [5, 7], "circumv": [5, 7], "obfusc": 5, "forens": 5, "hinder": [5, 8], "recov": 5, "perpetr": 5, "profil": [5, 6], "authent": 5, "hack": [5, 7], "malfeas": 5, "faulti": 5, "password": 5, "irregular": 5, "fraudul": 5, "induc": 5, "disclos": [5, 8], "usernam": 5, "turn": [5, 7], "multifactor": 5, "unusu": 5, "freez": 5, "suspici": 5, "nwhile": 5, "ninvest": 5, "ongo": [5, 6], "contempl": 5, "endeavor": 5, "distract": 5, "tangibl": 5, "approv": 5, "oner": 5, "ventur": 5, "riski": 5, "leas": 5, "unfavor": 5, "arisen": 5, "ordinari": 5, "cours": [5, 6, 7], "resolv": [5, 6, 7], "sometim": [5, 8], "indemnif": 5, "indemnifi": 5, "alleg": 5, "magnitud": 5, "assert": 5, "royalti": 5, "vigor": 5, "defend": 5, "court": [5, 6], "internation": 5, "plaintiff": 5, "injunct": 5, "relief": 5, "nregardless": 5, "merit": 5, "recognit": [5, 6, 7], "settl": 5, "uncertain": 5, "disgorg": 5, "remedi": [5, 7], "worldwid": 5, "antitrust": 5, "bill": 5, "commerc": 5, "televis": 5, "film": 5, "anticorrupt": 5, "cash": 5, "repatri": 5, "launder": 5, "tax": 5, "wast": 5, "recycl": 5, "ncomplianc": 5, "impos": [5, 6, 7, 8], "agent": [5, 6, 7], "nregulatori": 5, "ban": [5, 7], "nexpect": 5, "increasingli": [5, 6, 7, 8], "greenhous": 5, "ga": 5, "emiss": 5, "civil": 5, "disagre": 5, "perceiv": 5, "feder": 5, "nfrom": 5, "noncompli": 5, "individu": [5, 6, 7], "lawsuit": [5, 6], "monopol": 5, "nfurther": 5, "earn": 5, "search": [5, 6, 7], "nthere": 5, "transfer": 5, "pass": [5, 6, 7, 8], "pend": 5, "inquiri": [5, 7], "government": 5, "entiti": [5, 6, 7, 8], "biometr": 5, "notif": 5, "permit": [5, 6, 8], "healthcar": [5, 6], "liabl": 5, "investigatori": 5, "cardhold": 5, "acquir": 5, "denomin": 5, "offset": 5, "strengthen": [5, 7], "nconvers": 5, "thu": 5, "hedg": 5, "deterior": 5, "sovereign": 5, "heighten": [5, 7], "worsen": 5, "A": [5, 6, 7, 8], "collater": 5, "bank": 5, "unsecur": 5, "subassembli": 5, "assembl": 5, "legisl": 5, "ireland": [5, 7], "singapor": 5, "organis": 5, "statutori": 5, "valuat": 5, "defer": 5, "bodi": [5, 7], "adequaci": 5, "ow": 5, "ngener": 5, "repurchas": 5, "dividend": 5, "consumm": 5, "declar": 5, "board": [5, 7], "unresolv": 5, "nnone": 5, "threat": [5, 7], "postur": 5, "25": [5, 6, 7], "2016": 5, "coordin": [5, 7], "committe": [5, 7], "oversight": [5, 7], "counsel": 5, "chair": 5, "headquart": 5, "cupertino": [5, 8], "center": [5, 7, 8], "formal": [5, 7, 8], "conclud": [5, 6], "uninstal": 5, "web": [5, 6, 7], "browser": 5, "june": 5, "contractu": 5, "desist": 5, "stai": [5, 6], "grant": 5, "ndepart": 5, "justic": 5, "depart": [5, 7], "doj": 5, "district": 5, "attornei": 5, "jersei": 5, "redress": [5, 7], "anticompetit": 5, "nonmonetari": 5, "defens": [5, 7], "nepic": 5, "epic": 5, "northern": 5, "unfair": [5, 7], "enjoin": 5, "extern": [5, 7], "januari": 5, "motion": 5, "oppos": [5, 7], "vacat": 5, "fourth": 5, "mine": 5, "nnot": 5, "aapl": 5, "nholder": 5, "na": [5, 7], "301": 5, "npurchas": 5, "nshare": 5, "nperiod": 5, "ttotal": 5, "taverag": 5, "npaid": 5, "nannounc": 5, "napproxim": 5, "That": [5, 7, 8], "nunder": 5, "njune": 5, "august": [5, 7], "nopen": 5, "negoti": [5, 7], "t35": 5, "697": 5, "t224": 5, "naugust": 5, "31": [5, 6], "t42": 5, "910": 5, "t221": 5, "39": [5, 6], "nseptemb": 5, "t33": 5, "653": 5, "t222": 5, "86": [5, 6], "ntotal": [5, 7], "t112": 5, "260": 5, "t89": 5, "074": 5, "110": 5, "10b5": 5, "reinvest": 5, "dow": 5, "supersector": 5, "27": [5, 6, 7], "2019": 5, "n2218": 5, "tseptemb": 5, "t100": 5, "t207": 5, "t273": 5, "t281": 5, "t322": 5, "t430": 5, "t113": 5, "t156": 5, "t131": 5, "t155": 5, "t210": 5, "ndow": 5, "t146": 5, "t216": 5, "t215": 5, "nfirst": 5, "nsecond": 5, "nthird": 5, "sequoia": 5, "nfourth": 5, "plu": [5, 6], "nfiscal": 5, "six": 5, "realign": 5, "span": [5, 6, 7], "indirectli": 5, "n2024": 5, "tchang": 5, "t2023": 5, "t2022": 5, "namerica": 5, "t167": 5, "045": 5, "t3": 5, "t162": 5, "560": 5, "t169": 5, "658": 5, "neurop": 5, "t101": 5, "328": 5, "t7": 5, "294": 5, "t95": 5, "118": 5, "ngreater": 5, "t66": 5, "952": 5, "t72": 5, "559": 5, "t74": 5, "njapan": 5, "t25": 5, "052": 5, "t24": 5, "257": 5, "977": 5, "nrest": 5, "t30": 5, "t4": 5, "t29": 5, "615": 5, "t1": 5, "t391": 5, "035": 5, "t2": 5, "t383": 5, "285": 5, "t394": 5, "weak": [5, 7], "renminbi": 5, "yen": [5, 8], "t201": 5, "183": 5, "t200": 5, "583": 5, "t205": 5, "489": 5, "984": 5, "357": 5, "t40": 5, "177": [5, 7], "t26": 5, "694": 5, "t28": 5, "300": 5, "292": 5, "t37": 5, "005": 5, "t39": 5, "845": [5, 7], "t41": 5, "241": 5, "n96": 5, "169": 5, "t13": 5, "t85": 5, "t9": 5, "t78": 5, "129": [5, 7], "amort": 5, "bundl": 5, "flat": 5, "ngross": 5, "t109": 5, "633": 5, "t108": 5, "803": 5, "t114": 5, "728": 5, "t71": 5, "t60": 5, "345": 5, "t56": 5, "054": 5, "t180": 5, "683": 5, "148": 5, "t170": 5, "782": 5, "t36": 5, "t73": 5, "t70": 5, "t46": 5, "t44": 5, "t43": 5, "noper": 5, "t31": 5, "370": 5, "t5": 5, "915": 5, "t14": 5, "251": 5, "npercentag": 5, "t8": 5, "nsell": 5, "administr": 5, "097": 5, "932": 5, "094": 5, "t6": 5, "t57": 5, "467": 5, "t54": 5, "847": 5, "t51": 5, "t15": 5, "headcount": 5, "nprovis": 5, "749": 5, "t16": 5, "741": 5, "t19": 5, "neffect": 5, "nstatutori": 5, "t21": 5, "aid": [5, 7], "nliquid": 5, "unrestrict": 5, "140": 5, "ndebt": 5, "97": [5, 7], "payabl": 5, "promissori": 5, "nleas": 5, "nmanufactur": 5, "noncancel": 5, "ndeem": 5, "tcja": 5, "nstate": 5, "fund": [5, 6], "escrow": 5, "ncapit": 5, "95": [5, 7], "nrecent": 5, "pronounc": 5, "nincom": 5, "fasb": 5, "asu": 5, "09": [5, 7], "740": 5, "reconcili": 5, "reconcil": [5, 8], "disaggreg": 5, "prospect": 5, "novemb": [5, 7], "07": [5, 7, 8], "280": 5, "maker": 5, "codm": 5, "retrospect": 5, "ncritic": 5, "conform": [5, 8], "gaap": 5, "nuncertain": 5, "domest": 5, "taxat": 5, "resolut": 5, "conting": 5, "ninterest": 5, "forth": 5, "hypothet": 5, "nsensit": 5, "nhypothet": 5, "nrate": 5, "npotenti": 5, "n100": 5, "tenor": 5, "ndeclin": 5, "755": 5, "089": 5, "nterm": 5, "nincreas": 5, "t139": 5, "t194": 5, "nforeign": 5, "var": 5, "mont": 5, "carlo": 5, "interv": 5, "538": 5, "669": 5, "nindex": 5, "tpage": 5, "nconsolid": 5, "n29": 5, "n30": 5, "sheet": 5, "n31": 5, "n32": 5, "n33": 5, "nnote": 5, "n34": 5, "nreport": 5, "n48": 5, "nall": 5, "omit": [5, 8], "submiss": 5, "nyear": 5, "n2023": 5, "n2022": 5, "nnet": 5, "t294": 5, "866": 5, "t298": 5, "085": 5, "t316": 5, "199": 5, "t96": 5, "ncost": 5, "t185": 5, "233": 5, "t189": 5, "282": 5, "471": 5, "119": 5, "855": 5, "t22": 5, "075": 5, "352": 5, "t214": 5, "137": 5, "t223": 5, "546": 5, "t123": 5, "216": 5, "t119": 5, "437": 5, "t269": 5, "565": 5, "334": 5, "485": 5, "736": 5, "103": 5, "t93": 5, "995": 5, "t99": 5, "nearn": 5, "nbasic": 5, "ndilut": 5, "08": [5, 6, 8], "343": [5, 7], "783": 5, "744": 5, "215": 5, "963": 5, "095": 5, "812": 5, "547": 5, "325": 5, "819": 5, "nsee": 5, "translat": [5, 6, 7], "t395": 5, "765": 5, "511": 5, "unreal": 5, "832": 5, "t323": 5, "212": 5, "nadjust": 5, "337": 5, "717": 5, "394": 5, "138": 5, "850": 5, "563": 5, "104": 5, "t204": 5, "t253": 5, "816": 5, "899": 5, "272": 5, "t98": 5, "016": 5, "652": 5, "t88": 5, "531": 5, "nasset": 5, "ncurrent": 5, "ncash": 5, "943": 5, "965": 5, "228": 5, "590": 5, "naccount": 5, "410": 5, "508": 5, "nvendor": 5, "t32": 5, "833": 5, "477": 5, "ninventori": 5, "286": 5, "331": 5, "287": 5, "695": 5, "t152": 5, "987": 5, "t143": 5, "566": 5, "t91": 5, "479": 5, "544": 5, "t45": 5, "680": 5, "715": 5, "834": 5, "t64": 5, "758": 5, "t211": 5, "993": 5, "t209": 5, "017": 5, "t364": 5, "980": [5, 7], "t352": 5, "nliabil": 5, "t68": 5, "960": 5, "t62": 5, "611": 5, "304": 5, "t58": 5, "829": 5, "ndefer": 5, "249": 5, "061": 5, "ncommerci": 5, "967": 5, "985": 5, "t10": 5, "912": 5, "822": 5, "t176": 5, "392": 5, "t145": 5, "308": 5, "750": 5, "888": 5, "t49": 5, "848": 5, "638": 5, "t308": 5, "030": [5, 6], "t290": 5, "ncommit": 5, "nsharehold": 5, "400": 5, "116": 5, "786": 5, "550": 5, "n83": 5, "276": 5, "naccumul": 5, "deficit": 5, "154": 5, "214": 5, "172": 5, "452": 5, "950": 5, "146": [5, 7], "t50": 5, "672": 5, "t63": 5, "090": 5, "nbegin": 5, "849": 5, "365": 5, "423": 5, "346": 5, "175": 5, "withheld": 5, "settlement": 5, "521": 5, "971": 5, "t12": 5, "034": 5, "t11": 5, "nend": 5, "t83": 5, "nretain": 5, "068": 5, "562": 5, "ndividend": 5, "218": 5, "793": 5, "612": 5, "099": 5, "454": 5, "846": 5, "77": [5, 6], "046": 5, "186": 5, "109": 5, "t163": 5, "rsu": 5, "t0": 5, "98": [5, 6], "94": [5, 6, 7], "737": 5, "929": 5, "ndepreci": 5, "445": 5, "519": 5, "688": 5, "038": 5, "266": 5, "227": 5, "006": 5, "788": 5, "356": 5, "271": 5, "520": 5, "618": 5, "484": 5, "731": 5, "684": 5, "499": 5, "020": 5, "889": 5, "448": 5, "552": 5, "031": 5, "t118": 5, "254": 5, "t110": 5, "543": 5, "t122": 5, "151": 5, "48": [5, 6], "656": 5, "513": 5, "76": [5, 7], "923": 5, "nproce": 5, "211": 5, "686": 5, "917": 5, "135": 5, "828": 5, "446": 5, "447": 5, "959": 5, "708": 5, "086": 5, "935": 5, "705": 5, "354": 5, "nfinanc": 5, "441": 5, "431": 5, "223": [5, 7], "234": [5, 7], "025": 5, "841": 5, "nrepurchas": 5, "949": 5, "89": [5, 7], "402": 5, "465": 5, "nrepay": 5, "958": 5, "repay": 5, "978": 5, "955": 5, "361": 5, "581": 5, "160": 5, "121": 5, "983": 5, "488": 5, "794": 5, "760": 5, "nsupplement": 5, "102": 5, "t18": 5, "679": 5, "573": 5, "33": [5, 6, 7], "nbasi": 5, "prior": [5, 7], "reclassifi": 5, "nrevenu": 5, "remit": [5, 7], "straight": 5, "vest": 5, "sold": 5, "nderiv": 5, "nonleas": 5, "34": [5, 7], "entitl": 5, "commenc": 5, "deliveri": 5, "stand": 5, "ssp": 5, "icloud": 5, "siri": 5, "discount": 5, "undeliv": 5, "unbil": 5, "n26": 5, "n37": 5, "moder": [5, 6], "64": [5, 6, 7], "dilut": 5, "nnumer": 5, "ndenomin": 5, "nweight": 5, "312": 5, "316": 5, "856": 5, "antidilut": 5, "tunreal": 5, "ngain": 5, "tfair": 5, "nvalu": 5, "tcash": 5, "nequival": 5, "tcurrent": 5, "tnon": 5, "t27": 5, "nlevel": 5, "nmonei": 5, "t778": 5, "nmutual": 5, "n515": 5, "t105": 5, "t617": 5, "nsubtot": 5, "293": 5, "395": 5, "nu": 5, "treasuri": 5, "516": 5, "t212": 5, "087": 5, "380": 5, "159": 5, "t703": 5, "t17": 5, "568": 5, "158": 5, "810": 5, "ncertif": 5, "deposit": 5, "t873": 5, "t387": 5, "t478": 5, "066": 5, "ncorpor": 5, "t65": 5, "622": 5, "t270": 5, "953": 5, "939": 5, "027": 5, "t47": 5, "886": 5, "nmunicip": 5, "t412": 5, "t405": 5, "t190": 5, "nmortgag": 5, "595": 5, "t175": 5, "403": 5, "t23": 5, "367": 5, "278": [5, 7], "t132": 5, "t583": 5, "635": 5, "t128": 5, "056": 5, "966": 5, "t34": 5, "t160": 5, "t688": 5, "650": 5, "36": [5, 6, 7], "359": [5, 7], "t481": 5, "n442": 5, "t428": 5, "t923": 5, "t909": 5, "406": 5, "114": 5, "468": 5, "136": 5, "t271": 5, "533": 5, "048": [5, 6], "491": 5, "332": 5, "t320": 5, "t608": 5, "t76": 5, "840": 5, "956": 5, "890": 5, "t20": 5, "627": 5, "243": 5, "t628": 5, "t602": 5, "t192": 5, "t410": 5, "735": 5, "636": 5, "t344": 5, "t144": 5, "470": 5, "657": 5, "831": 5, "125": 5, "162": 5, "t173": 5, "752": 5, "corrobor": 5, "mortgag": 5, "classifi": [5, 7], "37": [5, 6, 7], "swap": 5, "remeasur": 5, "notion": 5, "069": 5, "730": 5, "575": 5, "493": 5, "t104": 5, "777": 5, "nhedg": 5, "433": 5, "505": 5, "247": [5, 7], "ntrade": 5, "41": [5, 6, 7], "44": [5, 7], "depreci": 5, "nland": 5, "690": 5, "nmachineri": 5, "t80": 5, "205": [5, 6], "314": 5, "nleasehold": 5, "839": 5, "599": 5, "73": [5, 6, 7], "884": 5, "852": 5, "t55": 5, "906": 5, "601": 5, "703": 5, "010": 5, "457": 5, "634": 5, "391": 5, "neuropean": 5, "opinion": [5, 7], "1991": 5, "2007": 5, "irish": 5, "branch": 5, "2003": 5, "2014": 5, "2015": 5, "minist": 5, "juli": [5, 7], "annul": 5, "ecj": 5, "hear": 5, "asid": 5, "confirm": 5, "unrecogn": 5, "nfeder": 5, "571": 5, "080": 5, "644": 5, "265": 5, "801": 5, "726": 5, "570": 5, "298": 5, "49": [5, 7], "t84": 5, "428": 5, "603": 5, "483": [5, 7], "t347": 5, "t669": 5, "076": 5, "830": 5, "419": 5, "072": 5, "pretax": 5, "72": [5, 7], "ncomput": 5, "885": 5, "012": 5, "124": 5, "518": 5, "nimpact": 5, "246": 5, "311": 5, "366": 5, "397": 5, "nexcess": 5, "893": 5, "871": 5, "192": [5, 7], "739": 5, "ntax": 5, "carryforward": 5, "302": 5, "naccru": 5, "413": [5, 7], "421": 5, "nunreal": 5, "173": 5, "168": 5, "873": 5, "743": 5, "nless": 5, "374": 5, "007": 5, "369": 5, "551": 5, "998": 5, "nright": 5, "179": 5, "nminimum": 5, "674": 5, "940": 5, "t511": 5, "t455": 5, "t490": 5, "805": 5, "202": 5, "indefinit": 5, "temporari": 5, "727": 5, "044": 5, "284": 5, "ndecreas": 5, "386": 5, "463": 5, "982": 5, "542": 5, "936": 5, "070": 5, "expir": 5, "statut": 5, "229": 5, "494": 5, "closur": 5, "intercompani": 5, "exceed": [5, 7], "multiyear": 5, "exercis": 5, "noncash": 5, "rou": 5, "tfinanci": 5, "t2024": 5, "tother": 5, "661": 5, "tproperti": 5, "015": 5, "303": 5, "676": 5, "t165": 5, "t752": 5, "t859": 5, "430": 5, "842": [5, 7], "tfinanc": 5, "n2025": 5, "820": 5, "t171": 5, "991": 5, "n2026": 5, "914": 5, "n2027": 5, "t59": 5, "733": 5, "n2028": 5, "360": 5, "t38": 5, "398": 5, "n2029": 5, "187": 5, "nthereaft": 5, "t837": 5, "undiscount": 5, "790": 5, "imput": 5, "376": 5, "534": 5, "t896": 5, "borrow": 5, "proce": 5, "nine": [5, 7], "nmatur": 5, "333": 5, "264": 5, "948": 5, "645": 5, "309": 5, "arrear": 5, "namount": 5, "n2013": 5, "nfix": 5, "2062": 5, "t97": 5, "341": 5, "03": 5, "65": [5, 7], "t106": 5, "572": 5, "n97": 5, "nunamort": 5, "321": 5, "358": 5, "113": 5, "662": 5, "930": 5, "342": 5, "800": 5, "180": 5, "88": 5, "ndure": 5, "425": 5, "426": 5, "372": 5, "589": 5, "055": 5, "appreci": 5, "four": [5, 6, 7], "holder": [5, 6], "n2014": 5, "bonu": 5, "nrestrict": 5, "nnumber": 5, "nrsu": 5, "ngrant": 5, "naggreg": 5, "nfair": 5, "nbalanc": 5, "t240": 5, "427": [5, 7], "t75": 5, "t150": 5, "861": 5, "501": 5, "768": 5, "87": [5, 6, 7], "101": [5, 7], "878": 5, "144": 5, "t127": 5, "t135": 5, "91": [5, 7], "456": 5, "78": [5, 6, 7], "59": [5, 7], "t140": 5, "326": 5, "t158": 5, "204": 5, "350": 5, "002": [5, 6], "nuncondit": 5, "uncondit": 5, "206": 5, "440": 5, "156": 5, "t633": 5, "t670": 5, "226": 5, "45": 5, "nconting": 5, "accrual": 5, "nconcentr": 5, "attribut": [5, 6, 7, 8], "46": 5, "t67": 5, "098": 5, "082": 5, "062": 5, "569": 5, "895": 5, "458": 5, "207": 5, "nonrecur": 5, "t142": 5, "196": 5, "t138": 5, "t147": 5, "859": 5, "nchina": 5, "n66": 5, "t181": 5, "887": 5, "t172": 5, "269": 5, "nlong": 5, "664": 5, "797": 5, "778": 5, "219": 5, "nopinion": 5, "nwe": 5, "fairli": 5, "pcaob": 5, "sponsor": 5, "treadwai": 5, "2013": 5, "unqualifi": 5, "thereon": 5, "nthese": 5, "misstat": 5, "fraud": [5, 7], "ndescript": 5, "naudit": 5, "nhow": 5, "nmatter": 5, "qualifi": 5, "letter": 5, "advisor": 5, "ernst": 5, "llp": 5, "auditor": 5, "2009": 5, "nsan": 5, "jose": 5, "nnovemb": 5, "coso": 5, "nour": 5, "ndefinit": 5, "disposit": 5, "receipt": 5, "nevalu": 5, "nbase": 5, "supervis": [5, 6, 7, 8], "13a": 5, "15d": 5, "ninher": 5, "paragraph": 5, "51": [5, 7, 8], "ninsid": 5, "deirdr": 5, "brien": 5, "vice": 5, "presid": 5, "affirm": 5, "april": 5, "withhold": 5, "remitt": 5, "mr": 5, "copi": 5, "solicit": 5, "00042": 5, "nincorpor": 5, "texhibit": 5, "descript": [5, 6, 7, 8], "tform": 5, "tfile": 5, "nrestat": 5, "namend": 5, "bylaw": 5, "nindentur": 5, "york": [5, 6, 8], "mellon": 5, "truste": 5, "noffic": 5, "certif": 5, "2018": 5, "85": [5, 6, 7], "05": 5, "2044": 5, "februari": 5, "2045": 5, "900": 5, "700": [5, 6], "250": [5, 7], "2036": 5, "2046": 5, "450": 5, "2047": 5, "2049": 5, "2030": 5, "2050": 5, "2060": 5, "2028": 5, "2041": 5, "2061": 5, "2032": 5, "2052": 5, "54": 5, "2033": 5, "2053": 5, "n12": 5, "nsubsidiari": 5, "n23": 5, "nconsent": 5, "n24": 5, "npower": 5, "signatur": 5, "nrule": 5, "nsection": 5, "1350": 5, "n101": 5, "ninlin": 5, "xbrl": 5, "n104": 5, "inlin": 5, "compensatori": 5, "herewith": 5, "furnish": 5, "herebi": 5, "undertak": 5, "56": [5, 6, 7], "nsignatur": 5, "npursuant": 5, "duli": 5, "undersign": 5, "thereunto": 5, "ndate": 5, "nby": 5, "luca": [5, 8], "maestri": 5, "nluca": 5, "nsenior": 5, "nchief": 5, "nknow": 5, "THESE": 5, "appoint": 5, "cook": 5, "jointli": 5, "her": 5, "substitut": 5, "him": 5, "thereto": 5, "therewith": 5, "ratifi": 5, "virtu": 5, "hereof": 5, "nname": 5, "ttitl": 5, "tdate": 5, "tchief": 5, "tnovemb": 5, "ntimothi": 5, "tsenior": 5, "kondo": 5, "nchri": 5, "wanda": 5, "austin": 5, "nwanda": 5, "gorski": 5, "tdirector": 5, "nalex": 5, "jung": 5, "nandrea": 5, "arthur": 5, "levinson": 5, "narthur": 5, "monica": 5, "lozano": 5, "nmonica": 5, "ronald": 5, "sugar": 5, "nronald": 5, "susan": 5, "wagner": 5, "nsusan": 5, "57": [5, 6], "turbo": [5, 6, 8], "outlin": [5, 6, 7], "invdestacksmeticsisdict": 5, "setispect": 5, "20cyan": 5, "evaluationseld": 5, "anvis": 5, "droitent": 5, "discernminerv": 5, "versbobprefvers": 5, "vo\u8be5": 5, "option\u548c": 5, "meio": 5, "\u0432\u0440\u0435\u043ccisco": 5, "dellaischenpoihscap": 5, "geme": 5, "gettim": 5, "unscal": 5, "vocabulari": [5, 6, 8], "closer": 5, "sharpen": 5, "uniform": 5, "raschka": 5, "repetit": [5, 8], "radic": 5, "grappl": 5, "safer": [5, 7], "fascin": 5, "spontan": 5, "answer": [5, 6, 7, 8], "aren": [5, 6], "linear": 5, "absent": [5, 7], "coax": 5, "journei": 5, "suddenli": 5, "manifest": 5, "deliber": [5, 7], "contend": 5, "rethink": 5, "tutor": 5, "children": [5, 7], "verifi": [5, 6, 8], "predefin": [5, 8], "weren": 5, "kind": 5, "usual": 5, "quantif": 5, "contamin": [5, 7], "unseen": [5, 7], "longitudin": 5, "mostli": [5, 8], "latter": 5, "tailor": [5, 7], "great": [5, 6, 8], "cognit": 5, "misinform": [5, 7], "fabric": [5, 7], "tempor": 5, "disclaim": 5, "referr": 5, "incorrect": [5, 7], "demograph": [5, 7], "stereotyp": [5, 7], "societ": [5, 7], "pii": [5, 7], "anonym": 5, "leakag": [5, 7], "carryov": 5, "fallaci": 5, "think": [5, 6, 7], "idiom": 5, "sarcasm": 5, "terminologi": 5, "lingual": 5, "misunderstand": 5, "syntax": 5, "scan": 5, "compat": [5, 6, 8], "overconfid": 5, "clariti": [5, 7, 8], "audienc": 5, "densiti": 5, "satisfact": [5, 8], "misus": [5, 7], "moral": 5, "co2": 5, "etc": [5, 8], "palm": [5, 6], "easi": [5, 6, 7], "synthet": [5, 6, 7, 8], "timeout": 5, "inter": 5, "rater": 5, "ti": 5, "holist": [5, 7], "experiment": [5, 6, 8], "vi": 5, "categor": [5, 6, 7, 8], "intrins": [5, 6], "extrins": 5, "sequenc": [5, 6, 8], "perplex": [5, 6], "downstream": [5, 8], "synthesi": 5, "discret": 5, "prefix": [5, 7], "roug": 5, "bleu": 5, "bilingu": 5, "understudi": 5, "overlap": 5, "favor": [5, 6, 8], "breviti": 5, "insensit": 5, "semant": [5, 8], "orient": [5, 7], "gist": 5, "meteor": 5, "synonym": 5, "paraphras": 5, "alongsid": [5, 7], "computation": 5, "cider": 5, "consensu": 5, "tf": 5, "idf": 5, "caption": 5, "reliant": 5, "corpu": [5, 6], "ter": 5, "edit": [5, 7], "hypothesi": 5, "penal": 5, "bertscor": 5, "contextu": [5, 7], "bert": 5, "spice": 5, "proposit": [5, 6], "scene": [5, 7], "analyst": 5, "rouge_1": 5, "rouge_2": 5, "ideal": [5, 6, 7, 8], "setup": [5, 6, 7, 8], "evaluate_summari": 5, "unigram": 5, "bigram": 5, "absl": 5, "py": [5, 8], "rouge_scor": 5, "generated_summari": 5, "reference_summari": 5, "google_bleu": 5, "bleu_scor": 5, "rouge1": 5, "rouge2": 5, "arbitrari": 5, "chosen": [5, 7], "sentence1": 5, "cat": [5, 7], "sat": 5, "mat": 5, "sentence2": 5, "ate": 5, "3333333333333333": 5, "7272727272727272": 5, "4444444444444445": 5, "generate_summari": 5, "summir": 5, "liner": 5, "evaluate_summary_model": 5, "model_benchmark": 5, "models_test": 5, "benchmark_summari": 5, "model_summari": 5, "evaluation_result": 5, "statu": 5, "concis": [5, 6], "element": [5, 7, 8], "verbos": [5, 6, 7, 8], "peripher": 5, "quit": [5, 6, 8], "convei": 5, "breadth": 5, "Of": [5, 6, 7], "vibe": 5, "visualize_prompt_comparison": 5, "matplotlib": 5, "radar": 5, "radar_plot": 5, "tmp": 5, "ipykernel_1652501": 5, "940173201": 5, "userwarn": [5, 8], "figurecanvasagg": 5, "largest": [5, 6], "sarmah": 5, "granular": [5, 6], "likert": 5, "ensembl": 5, "repeatedli": 5, "fluenci": 5, "refin": 5, "integ": [5, 8], "rubric": 5, "hollist": 5, "judgeevalu": 5, "grammar": [5, 6, 8], "evaluate_with_llm": 5, "criterion": 5, "judge_model": 5, "candidate_summari": 5, "grammat": 5, "y": [5, 7, 8], "z": 5, "w": [5, 6, 7], "benchmark_model": 5, "test_model": 5, "input_text": [5, 6], "trillion": [5, 6], "evals_list": 5, "1775618912": 5, "slightli": 5, "drift": [5, 7], "lowest": [5, 6], "firstli": 5, "overhead": [5, 6], "egocentr": 5, "tight": 5, "medicin": [5, 7], "glider": 5, "deshpand": 5, "3b": 5, "685": 5, "aplic": 5, "earlier": [5, 7], "depict": [5, 7, 8], "multilingu": [5, 6, 7], "golden": 5, "languang": 5, "arena": 5, "randomli": 5, "customiz": [5, 6, 7], "irrelev": 5, "unhelp": [5, 7], "occasion": 5, "rare": 5, "perfectli": 5, "cater": [5, 6], "critiqu": [5, 7], "elo": 5, "exam": 5, "probe": [5, 7], "certifi": 5, "began": [5, 6], "glue": 5, "entail": [5, 6], "superglu": 5, "successor": 5, "grew": 5, "big": 5, "bench": [5, 6], "srivastava": 5, "truthfulqa": [5, 6], "multitask": 5, "hendryck": [5, 7], "multidisciplinari": 5, "stanford": 5, "helm": 5, "multidimension": 5, "surround": [5, 6, 7, 8], "humanev": [5, 6], "lmsy": 5, "brought": 5, "dialogu": [5, 6], "chiang": 5, "gather": 5, "alpacaev": 5, "duboi": 5, "mt": 5, "argilla": 5, "mila": 5, "mit": [5, 6], "contributor": [5, 6, 8], "western": 5, "centric": 5, "divid": [5, 7], "subset": [5, 7], "agnost": 5, "dialect": 5, "render": [5, 7], "crowdsourc": 5, "livebench": 5, "white": [5, 7], "resili": [5, 7], "meaningfulli": 5, "zebralog": 5, "grid": 5, "puzzl": 5, "brailsford": 5, "1999": 5, "lsat": 5, "hous": 5, "clue": 5, "deduct": 5, "programmat": [5, 8], "2x2": 5, "6x6": 5, "shot": [5, 7, 8], "reductio": 5, "ad": [5, 6, 7, 8], "absurdum": 5, "hard": 5, "10b": 5, "counterfactu": 5, "came": 5, "arc": 5, "prize": [5, 7], "chollet": 5, "mike": [5, 7], "knoop": 5, "founder": 5, "zapier": 5, "fran\u00e7oi": 5, "creator": [5, 6], "agi": 5, "kera": 5, "genuin": 5, "possess": 5, "elementari": 5, "novelti": 5, "interpol": 5, "synthes": 5, "fly": 5, "brute": 5, "pixel": 5, "unbeaten": 5, "win": [5, 6], "poorli": 5, "recombin": 5, "spur": [5, 7], "takeawai": 5, "vertic": [5, 7], "finbench": 5, "legalbench": 5, "guha": 5, "berkelei": 5, "bfcl": 5, "patil": 5, "fourrier": 5, "bespok": 5, "sdk": 5, "autoregress": 5, "sub": [5, 6], "liter": 5, "disturb": 5, "zero": [5, 6, 7, 8], "varianc": [5, 7], "yt": 5, "ut": 5, "ol": 5, "heteroscedast": 5, "regress": 5, "bivari": 5, "evaluation_track": 5, "evaluationtrack": 5, "model_config": 5, "basemodelconfig": 5, "parallelismmanag": 5, "pipelineparamet": 5, "envconfig": 5, "is_accelerate_avail": 5, "datetim": 5, "timedelta": 5, "initprocessgroupkwarg": 5, "create_evaluation_pipelin": 5, "cache_dir": 5, "float16": 5, "max_sampl": 5, "kwargs_handl": 5, "3000": 5, "save_detail": 5, "pipeline_param": 5, "launcher_typ": 5, "env_config": 5, "override_batch_s": 5, "use_chat_templ": 5, "trust_remote_cod": 5, "pipeline_paramet": 5, "schemat": 5, "vllm": [5, 8], "tgi": 5, "num_few_shot": 5, "bar": 5, "bigbench": 5, "winogrand": 5, "hellaswag": 5, "nlp": [5, 6, 7], "save_and_push_result": 5, "show_result": 5, "model_arg": 5, "send": [5, 6, 7, 8], "serverless": 5, "inference_server_address": 5, "inference_server_auth": 5, "model_id": 5, "null": 5, "bash": [5, 6], "command": [5, 6], "model_config_path": 5, "endpoint_model": 5, "llama3": 5, "qwen2": [5, 6, 8], "alibaba": [5, 6, 8], "5b": [5, 6, 8], "hui": [5, 6], "allal": [5, 6], "cluster": 5, "noteworthi": [5, 6], "grain": [5, 6, 8], "salt": [5, 8], "modular": 5, "offici": [5, 8], "revisit": 5, "trace": 5, "langchain_tracing_v2": 5, "langchain_api_kei": 5, "hf_evalu": 5, "langsmith_evalu": 5, "ls_client": 5, "dataset_nam": 5, "create_dataset": 5, "create_exampl": 5, "dataset_id": 5, "calculate_scor": 5, "reference_output": 5, "oai_client": 5, "xp_model_nam": 5, "lastli": 5, "run_evalu": 5, "And": [5, 6, 7], "upload_result": 5, "experiment_prefix": 5, "num_repetit": 5, "386a3620": 5, "9e1cc3cb": 5, "9d6a": 5, "4356": 5, "ab34": 5, "138e0abe8be4": 5, "8741976e": 5, "5268": 5, "4b75": 5, "949f": 5, "99477dde5d64": 5, "selectedsess": 5, "b831dc1e": 5, "90bc": 5, "4ed8": 5, "8080": [5, 6], "fb42444724d6": 5, "4it": 5, "latest": [5, 6, 7, 8], "tobia": [5, 8], "evaluate_modul": 5, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 5, "tue": 5, "nov": [5, 6], "couldn": 5, "5it": 5, "5053784e": 5, "64445871": 5, "a53c": 5, "44b1": 5, "a422": 5, "4f49b2f9656f": 5, "69": [5, 7], "4b29f3c9": 5, "9ef7e39a": 5, "2add": 5, "410c": 5, "89f8": 5, "9f1a8b198cf1": 5, "61": [5, 7], "insert": 5, "combined_df": 5, "concat": [5, 7], "ignore_index": [5, 7], "execution_tim": 5, "example_id": 5, "333333": 5, "224388": 5, "feb10f92": 5, "3167": 5, "41f3": 5, "bb1c": 5, "d271153a31a8": 5, "5b196b22": 5, "9f4c": 5, "489c": 5, "b020": 5, "7823208b42d6": 5, "348101": 5, "722464": 5, "c310f159": 5, "064a": 5, "4035": 5, "97c3": 5, "a25bbf43abc2": 5, "386076": 5, "704104": 5, "f7f24899": 5, "dd50": 5, "409e": 5, "93cc": 5, "6fb1622b60bf": 5, "443038": 5, "725059": 5, "242856d6": 5, "efb5": 5, "4101": 5, "b1cf": 5, "5805532838ac": 5, "373418": 5, "795302": 5, "ce975169": 5, "a0ab": 5, "40ce": 5, "8e32": 5, "efa28d06079d": 5, "stat": [5, 6], "groupbi": [5, 7], "agg": [5, 7], "sort": 5, "sort_valu": 5, "subplot": 5, "pyplot": 5, "plt": 5, "ax1": 5, "ax2": 5, "figsiz": 5, "2ecc71": 5, "3498db": 5, "e74c3c": 5, "bleu_mean": 5, "bleu_std": 5, "enumer": [5, 7], "errorbar": 5, "yerr": 5, "fmt": 5, "markers": 5, "capsiz": 5, "set_ylabel": 5, "set_titl": 5, "set_xtick": 5, "set_xticklabel": 5, "rotat": 5, "set_ylim": 5, "bottom": 5, "legend": 5, "exec_mean": 5, "exec_std": 5, "tight_layout": 5, "ndetail": 5, "4038": 5, "0453": 5, "7815": 5, "0433": 5, "3768": 5, "0424": 5, "8343": 5, "2208": 5, "3519": 5, "0775": 5, "9122": 5, "1482": 5, "377": 5, "042": 5, "078": 5, "slower": [5, 7], "04": [5, 6], "interestingli": 5, "decoupl": 5, "reload": 5, "facilit": [5, 7], "promptfooconfig": 5, "model_comparison": 5, "pretti": [5, 7], "dump": 5, "default_flow_styl": 5, "sort_kei": 5, "prompt1": 5, "defaulttest": 5, "1000m": 5, "eval_data": 5, "latency_m": 5, "totallatencym": 5, "token_usag": 5, "tokenusag": 5, "assert_pass": 5, "assertpasscount": 5, "assert_fail": 5, "assertfailcount": 5, "prompt_token": [5, 6], "num_request": 5, "numrequest": 5, "2463": 5, "000035": 5, "3773": 5, "004620": 5, "1669": 5, "000091": 5, "1669m": 5, "highest": [5, 6, 8], "3773m": 5, "00462": 5, "promptfool": 5, "manual": [5, 6, 7, 8], "redefin": 5, "prompt_comparison": 5, "prompt2": 5, "prompt3": 5, "prompt_fil": 5, "prompt_cont": 5, "BE": 5, "again": 5, "prompt_id": 5, "promptid": 5, "gradingresult": 5, "df_raw": 5, "reset_index": [5, 7], "eas": [5, 6, 7, 8], "hf": [5, 6], "plain": [5, 6], "vanilla": 5, "defi": 5, "accustom": 5, "legaci": 5, "unsustain": 5, "prd": 5, "cultiv": [5, 7], "organiz": 5, "stagnat": 5, "alb": [5, 6], "loubna": [5, 6], "anton": [5, 6], "lozhkov": [5, 6], "bakouch": [5, 6], "gabriel": [5, 6, 7], "mart\u00edn": [5, 6, 7], "bl\u00e1zquez": [5, 6], "lewi": [5, 6], "tunstal": [5, 6], "agust\u00edn": [5, 6], "piquer": [5, 6], "andr": [5, 6], "marafioti": [5, 6], "cyril": [5, 6], "zakka": [5, 6], "leandro": [5, 6], "werra": [5, 6], "wolf": [5, 6], "are24": 5, "judgearena": 5, "bps99": 5, "salli": 5, "pott": 5, "barbara": 5, "557": [5, 7], "sciencedirect": 5, "s0377221798003646": 5, "doi": [5, 7, 8], "1016": 5, "s0377": 5, "2217": 5, "00364": 5, "ctj": 5, "jerri": [5, 7], "tworek": [5, 7], "heewoo": [5, 7], "jun": [5, 7], "qime": [5, 7], "henriqu": [5, 7], "pond": [5, 7], "de": [5, 7], "oliveira": [5, 7], "pinto": [5, 7], "harri": [5, 7], "yuri": 5, "burda": 5, "greg": [5, 7], "brockman": [5, 7], "raul": [5, 7], "puri": [5, 7], "gretchen": [5, 7], "krueger": [5, 7], "petrov": [5, 7], "heidi": 5, "khlaaf": 5, "girish": [5, 7], "sastri": [5, 7], "brook": [5, 7], "chan": [5, 7], "grai": [5, 7], "ryder": [5, 7], "mikhail": [5, 7], "pavlov": [5, 7], "alethea": [5, 7], "lukasz": 5, "kaiser": [5, 7], "mohammad": [5, 7], "bavarian": [5, 7], "clemen": [5, 7], "winter": [5, 7], "philipp": 5, "tillet": [5, 7], "felip": [5, 7], "petroski": [5, 7], "dave": [5, 7], "cum": [5, 7], "plappert": 5, "fotio": 5, "chantzi": [5, 7], "barn": 5, "ariel": 5, "herbert": 5, "voss": [5, 7], "hebgen": 5, "guss": 5, "nichol": 5, "paino": [5, 7], "nikola": [5, 7], "tezak": [5, 7], "babuschkin": [5, 7], "suchir": [5, 7], "balaji": [5, 7], "shantanu": [5, 7], "jain": [5, 7], "hess": [5, 7], "carr": 5, "josh": [5, 7], "achiam": [5, 7], "vedant": 5, "misra": 5, "evan": [5, 6, 7], "morikawa": [5, 7], "matthew": 5, "knight": [5, 7], "mile": [5, 7], "brundag": [5, 7], "mira": [5, 7], "murati": [5, 7], "kati": [5, 7], "mayer": [5, 7], "bob": [5, 7, 8], "mcgrew": [5, 7], "ilya": [5, 7], "sutskev": [5, 7], "wojciech": [5, 7], "zaremba": [5, 7], "2107": 5, "03374": 5, "cz": 5, "lianmin": 5, "ying": 5, "sheng": 5, "anastasio": 5, "angelopoulo": 5, "tianl": 5, "dacheng": 5, "banghua": 5, "jordan": [5, 7], "gonzalez": 5, "ion": 5, "stoica": 5, "04132": 5, "cho24a": 5, "francoi": 5, "arcpriz": 5, "cho24b": 5, "drcw": 5, "darshan": 5, "selvan": 5, "sunitha": 5, "ravi": 5, "sky": 5, "ch": 5, "bartosz": 5, "mielczarek": 5, "anand": [5, 7], "kannappan": [5, 7], "qian": [5, 7], "14140": 5, "dglh24": 5, "yann": 5, "bal\u00e1z": 5, "galambosi": 5, "tatsunori": 5, "hashimoto": 5, "debia": 5, "04475": 5, "fac24a": 5, "wiki": [5, 8], "fac24b": 5, "fac24c": 5, "model_doc": 5, "fac24d": 5, "cookbook": 5, "llm_judg": 5, "fac24f": 5, "fhwt23": 5, "cl\u00e9mentin": 5, "nathan": 5, "habib": 5, "gnh": 5, "julian": 5, "nyarko": 5, "ho": 5, "r\u00e9": 5, "adam": [5, 7], "chilton": 5, "aditya": [5, 7], "narayana": 5, "chohla": 5, "brandon": [5, 7, 8], "waldon": 5, "rockmor": 5, "diego": 5, "zambrano": 5, "dmitri": 5, "talisman": 5, "enam": 5, "hoqu": 5, "faiz": 5, "surani": 5, "frank": [5, 7], "fagan": 5, "galit": 5, "sarfati": 5, "gregori": 5, "dickinson": 5, "haggai": 5, "porat": 5, "hegland": 5, "jessica": [5, 7], "joe": [5, 7], "nudel": 5, "joel": [5, 7], "niklau": 5, "nai": 5, "choi": 5, "margaret": [5, 6], "hagan": 5, "megan": 5, "livermor": 5, "nikon": 5, "rasumov": 5, "rahe": 5, "nil": 5, "holzenberg": 5, "noam": 5, "kolt": 5, "henderson": 5, "rehaag": 5, "sharad": 5, "shang": 5, "spencer": 5, "sunni": 5, "gandhi": 5, "zur": 5, "varun": 5, "iyer": 5, "zehua": 5, "2308": 5, "11462": 5, "hbb": 5, "collin": 5, "burn": 5, "steven": [5, 7], "basart": [5, 7], "zou": [5, 7], "manta": [5, 7], "mazeika": [5, 7], "03300": 5, "hbd": 5, "maxwel": 5, "forb": 5, "yejin": 5, "curiou": 5, "neural": [5, 8], "degener": 5, "1904": 5, "09751": 5, "hyc": [5, 6], "binyuan": [5, 6], "zeyu": [5, 6], "cui": [5, 6], "jiaxi": [5, 6], "dayiheng": [5, 6], "tianyu": [5, 6], "jiajun": [5, 6], "kai": [5, 6, 7], "dang": [5, 6], "coder": [5, 6], "preprint": [5, 6, 8], "2409": [5, 6, 7], "12186": [5, 6], "lx": 5, "zhen": 5, "xiaohan": 5, "jia": 5, "yuxuan": 5, "lai": 5, "chongyang": 5, "shuai": 5, "nlg": 5, "07103": 5, "lbl": 5, "bommasani": 5, "toni": 5, "dimitri": 5, "tsipra": 5, "dilara": 5, "soylu": 5, "michihiro": 5, "yasunaga": 5, "yian": 5, "deepak": 5, "narayanan": 5, "yuhuai": 5, "newman": 5, "binhang": 5, "bobbi": 5, "ce": 5, "christian": [5, 7], "cosgrov": 5, "acosta": 5, "nava": [5, 7], "drew": 5, "hudson": 5, "zelikman": 5, "esin": 5, "durmu": 5, "faisal": 5, "ladhak": 5, "frieda": 5, "rong": 5, "ren": [5, 6], "huaxiu": 5, "yao": [5, 7, 8], "jue": 5, "keshav": 5, "santhanam": 5, "laurel": 5, "lucia": 5, "mert": 5, "yuksekgonul": 5, "mirac": 5, "suzgun": 5, "niladri": 5, "chatterji": 5, "omar": 5, "khattab": 5, "chi": [5, 8], "sang": 5, "shibani": [5, 7], "santurkar": [5, 7], "surya": 5, "icard": 5, "tianyi": 5, "vishrav": 5, "chaudhari": 5, "xuechen": 5, "yuhui": 5, "yuta": 5, "koreeda": 5, "2211": 5, "09110": 5, "lbc24": 5, "ronan": 5, "bra": 5, "allenai": 5, "lhe22": [5, 6, 7], "stephani": [5, 6, 7], "owain": [5, 6, 7], "mimic": [5, 6, 7], "falsehood": [5, 6, 7], "2109": [5, 6, 7], "07958": [5, 6, 7], "pzwg23": 5, "shishir": 5, "tianjun": 5, "xin": [5, 7], "gorilla": 5, "15334": 5, "pro24": 5, "dev": 5, "ras24": 5, "sebastian": 5, "scratch": 5, "1633437166": 5, "sll": 5, "bhaskarjit": 5, "mingshu": 5, "jingrao": 5, "lyu": 5, "nathalia": 5, "castellano": 5, "pasquali": 5, "dhagash": 5, "12148": 5, "srf": 5, "shivalika": 5, "angelika": 5, "roman": [5, 7], "adelani": 5, "ngui": 5, "vila": 5, "suero": 5, "peerat": 5, "limkonchotiwat": 5, "kelli": 5, "marchisio": 5, "qi": 5, "leong": 5, "yosephin": 5, "susanto": 5, "raymond": [5, 7], "ng": [5, 7], "shayn": 5, "longpr": 5, "ko": 5, "madelin": 5, "antoin": 5, "bosselut": 5, "oh": 5, "leshem": 5, "choshen": 5, "daphn": 5, "ippolito": 5, "enzo": [5, 8], "ferrant": 5, "marzieh": 5, "fadae": 5, "beyza": 5, "ermi": 5, "sara": 5, "hooker": 5, "linguist": [5, 7], "03304": 5, "srr": 5, "aarohi": 5, "abhinav": 5, "rastogi": 5, "abhishek": 5, "rao": 5, "abu": 5, "awal": 5, "shoeb": 5, "abubakar": 5, "abid": [5, 6], "fisch": 5, "santoro": 5, "gupta": 5, "adri\u00e0": 5, "garriga": 5, "alonso": 5, "agnieszka": 5, "kluska": 5, "aitor": 5, "lewkowycz": 5, "akshat": 5, "warstadt": 5, "alexand": [5, 7, 8], "kocurek": 5, "ali": [5, 7], "safaya": 5, "tazarv": 5, "aman": 5, "hussain": 5, "dsouza": 5, "ambros": 5, "slone": 5, "ameet": 5, "rahan": 5, "anantharaman": 5, "ander": 5, "andreassen": 5, "madotto": 5, "santilli": 5, "stuhlm\u00fcller": 5, "la": 5, "lampinen": 5, "angelica": 5, "anh": 5, "vuong": 5, "animesh": 5, "gottardi": 5, "antonio": 5, "norelli": 5, "anu": 5, "venkatesh": 5, "arash": 5, "gholamidavoodi": 5, "arfa": 5, "tabassum": 5, "arul": 5, "menez": 5, "arun": [5, 7], "kirubarajan": 5, "asher": 5, "mullokandov": 5, "ashish": 5, "sabharw": 5, "herrick": 5, "avia": 5, "efrat": 5, "aykut": 5, "erdem": 5, "ayla": 5, "karaka\u015f": 5, "bao": [5, 6, 7], "loe": 5, "barret": [5, 7], "zoph": [5, 7], "bart\u0142omiej": 5, "bojanowski": 5, "batuhan": 5, "\u00f6zyurt": 5, "behnam": 5, "hedayatnia": 5, "neyshabur": 5, "inden": 5, "benno": 5, "stein": 5, "berk": 5, "ekmekci": 5, "blake": 5, "howald": 5, "bryan": 5, "orinion": 5, "diao": 5, "dour": 5, "stinson": 5, "cedrick": 5, "argueta": 5, "c\u00e9sar": 5, "ferri": 5, "ram\u00edrez": 5, "chandan": 5, "charl": 5, "rathkopf": 5, "chenlin": 5, "meng": 5, "chitta": 5, "baral": 5, "chiyu": 5, "callison": 5, "burch": 5, "voigt": 5, "cindi": 5, "ramirez": 5, "clara": 5, "rivera": 5, "clemencia": 5, "siro": 5, "colin": [5, 6], "raffel": [5, 6], "courtnei": 5, "ashcraft": 5, "cristina": 5, "garbacea": 5, "damien": [5, 7], "sileo": 5, "garrett": 5, "kilman": 5, "freeman": 5, "khashabi": 5, "levi": [5, 7], "mosegu\u00ed": 5, "gonz\u00e1lez": 5, "perszyk": 5, "danqi": 5, "dar": 5, "gilboa": 5, "dohan": [5, 7], "drakard": 5, "jurgen": 5, "debajyoti": 5, "datta": 5, "deni": 5, "emelin": 5, "kleyko": 5, "deniz": 5, "yuret": 5, "derek": [5, 7], "tam": [5, 8], "dieuwk": 5, "hupk": 5, "diganta": 5, "dilyar": 5, "buzan": 5, "coelho": 5, "mollo": 5, "diyi": 5, "dylan": 5, "schrader": 5, "ekaterina": 5, "shutova": 5, "ekin": 5, "dogu": 5, "cubuk": 5, "elad": 5, "segal": 5, "eleanor": 5, "hagerman": 5, "donowai": 5, "elli": 5, "pavlick": 5, "rodola": 5, "emma": 5, "lam": 5, "chu": [5, 7], "erkut": 5, "erni": 5, "dyer": 5, "jerzak": 5, "eunic": 5, "engefu": 5, "manyasi": 5, "evgenii": 5, "zheltonozhskii": 5, "fanyu": 5, "fatemeh": 5, "siar": 5, "fernando": 5, "mart\u00ednez": 5, "plume": 5, "francesca": 5, "happ\u00e9": 5, "gaurav": 5, "genta": 5, "indra": 5, "winata": 5, "gerard": 5, "melo": 5, "germ\u00e1n": 5, "kruszewski": 5, "giambattista": [5, 7], "parascandolo": [5, 7], "giorgio": 5, "mariani": 5, "gloria": 5, "gonzalo": 5, "jaimovitch": 5, "l\u00f3pez": 5, "gregor": 5, "betz": 5, "gui": [5, 6], "gur": 5, "hana": 5, "galijasev": 5, "rashkin": 5, "hannaneh": 5, "hajishirzi": 5, "harsh": 5, "hayden": 5, "bogar": 5, "henri": [5, 7], "shevlin": 5, "hinrich": 5, "sch\u00fctze": 5, "hiromu": 5, "yakura": 5, "hongm": 5, "hugh": 5, "mee": 5, "wong": [5, 7], "isaac": 5, "nobl": 5, "jaap": 5, "jumelet": 5, "geissing": 5, "jaehoon": 5, "jaim": 5, "fern\u00e1ndez": 5, "fisac": 5, "simon": 5, "koppel": 5, "koco\u0144": 5, "jana": 5, "thompson": [5, 6, 7], "janel": 5, "wingfield": 5, "jarema": 5, "radom": 5, "jascha": 5, "sohl": [5, 7], "dickstein": 5, "phang": 5, "yosinski": 5, "jekaterina": 5, "novikova": 5, "jell": 5, "bosscher": 5, "jennif": 5, "marsh": 5, "jeroen": 5, "taal": 5, "engel": 5, "jesujoba": 5, "alabi": 5, "jiam": 5, "jillian": 5, "joan": 5, "waweru": 5, "burden": 5, "bali": 5, "batcheld": 5, "berant": 5, "j\u00f6rg": 5, "frohberg": 5, "jo": 5, "rozen": 5, "orallo": 5, "boudeman": 5, "guerr": 5, "tenenbaum": 5, "joyc": 5, "chua": 5, "kanclerz": 5, "karen": 5, "livescu": 5, "karl": 5, "krauth": 5, "karthik": 5, "gopalakrishnan": 5, "katerina": 5, "ignatyeva": 5, "katja": 5, "markert": 5, "kaustubh": 5, "dhole": 5, "gimpel": 5, "omondi": 5, "kori": 5, "mathewson": 5, "kristen": 5, "chiafullo": 5, "ksenia": 5, "shkaruta": 5, "shridhar": 5, "kyle": [5, 7], "mcdonel": 5, "richardson": 5, "laria": 5, "reynold": 5, "leo": [5, 7], "dugan": 5, "lianhui": 5, "lidia": 5, "contrera": 5, "ochando": 5, "morenc": 5, "moschella": 5, "luci": 5, "ludwig": 5, "schmidt": [5, 7], "luheng": 5, "olivero": 5, "col\u00f3n": 5, "metz": [5, 7], "l\u00fctfi": 5, "kerem": 5, "\u015fenel": 5, "maarten": [5, 7], "bosma": 5, "sap": [5, 7], "maartj": 5, "hoev": 5, "maheen": 5, "farooqi": 5, "manaal": 5, "faruqui": 5, "marco": 5, "baturan": 5, "marelli": 5, "maru": 5, "maria": 5, "quintana": 5, "tolkiehn": 5, "mario": [5, 7], "giulianelli": 5, "martha": 5, "potthast": 5, "leavitt": 5, "hagen": 5, "m\u00e1ty\u00e1": 5, "schubert": 5, "medina": [5, 7], "orduna": 5, "baitemirova": 5, "melodi": 5, "arnaud": 5, "melvin": 5, "mcelrath": 5, "yee": 5, "cohen": 5, "ivanitskii": 5, "starritt": 5, "strube": 5, "micha\u0142": 5, "sw\u0119drowski": 5, "michel": [5, 7], "bevilacqua": 5, "mihir": 5, "kale": 5, "cain": 5, "mime": 5, "mitch": 5, "walker": 5, "mo": 5, "tiwari": 5, "mohit": 5, "bansal": 5, "moin": 5, "aminnaseri": 5, "mor": 5, "geva": 5, "mozhdeh": 5, "gheini": 5, "mukund": 5, "varma": 5, "nanyun": 5, "peng": [5, 7], "nayeon": 5, "neta": 5, "krakov": 5, "doiron": 5, "nicol": 5, "martinez": 5, "nikita": 5, "nangia": 5, "nikla": 5, "decker": 5, "muennighoff": 5, "nitish": [5, 7], "shirish": [5, 7], "keskar": [5, 7], "niveditha": 5, "constant": 5, "fiedel": 5, "nuan": 5, "wen": 5, "oliv": [5, 7], "agha": 5, "elbaghdadi": 5, "omer": 5, "moreno": 5, "casar": 5, "parth": 5, "doshi": 5, "pascal": 5, "fung": 5, "pu": 5, "vicol": 5, "pegah": 5, "alipoormolabashi": 5, "peiyuan": 5, "eckerslei": 5, "phu": 5, "mon": 5, "htut": 5, "pinyu": 5, "hwang": 5, "piotr": 5, "mi\u0142kowski": 5, "piyush": 5, "pouya": 5, "pezeshkpour": 5, "priti": 5, "oli": 5, "qiaozhu": 5, "qing": 5, "qinlang": 5, "rabin": 5, "banjad": 5, "rachel": [5, 7], "etta": 5, "rudolph": 5, "raefer": 5, "rahel": 5, "haback": 5, "ramon": 5, "risco": 5, "rapha\u00ebl": 5, "milli\u00e8r": 5, "rhythm": 5, "garg": [5, 6], "rif": 5, "saurou": 5, "riku": 5, "arakawa": 5, "robb": 5, "raymaek": 5, "rohan": 5, "sikand": 5, "novak": 5, "sitelew": 5, "lebra": 5, "rosann": 5, "rowan": [5, 7], "ruslan": 5, "salakhutdinov": 5, "stoval": 5, "teehan": 5, "sahib": 5, "saif": 5, "sajant": 5, "dillav": 5, "shleifer": 5, "wiseman": 5, "gruetter": 5, "schoenholz": 5, "sanghyun": 5, "sanjeev": 5, "kwatra": 5, "sarik": 5, "ghazarian": 5, "sayan": 5, "casei": [5, 7], "bischoff": 5, "gehrmann": 5, "schuster": 5, "sepideh": 5, "sadeghi": 5, "shadi": 5, "hamdan": 5, "sharon": 5, "shashank": 5, "sherri": 5, "shi": 5, "shikhar": 5, "shima": 5, "asaadi": 5, "shubh": 5, "pachchigar": 5, "shubham": 5, "toshniw": 5, "shyam": [5, 7], "upadhyai": 5, "shyamolima": 5, "debnath": 5, "siamak": 5, "shakeri": 5, "thormey": 5, "melzi": 5, "siva": 5, "reddi": 5, "sneha": 5, "priscilla": 5, "makini": 5, "soo": 5, "hwan": 5, "toren": 5, "sriharsha": 5, "hatwar": 5, "stanisla": 5, "dehaen": 5, "stefan": 5, "divic": 5, "stella": 5, "biderman": 5, "stephen": 5, "prasad": 5, "piantadosi": 5, "stuart": [5, 7], "shieber": 5, "summer": [5, 7], "misherghi": 5, "svetlana": 5, "kiritchenko": 5, "swaroop": 5, "tal": 5, "linzen": 5, "tariq": 5, "tatsu": 5, "te": 5, "th\u00e9o": 5, "desbord": 5, "theodor": 5, "rothschild": 5, "phan": [5, 7], "tiberiu": 5, "nkinyili": 5, "timo": 5, "schick": 5, "timofei": 5, "kornev": 5, "titu": 5, "tunduni": 5, "gerstenberg": 5, "trenton": 5, "trishala": 5, "neeraj": 5, "tushar": 5, "khot": 5, "shultz": 5, "uri": 5, "shaham": 5, "vera": 5, "demberg": 5, "victoria": [5, 7], "nyamai": 5, "vika": 5, "raunak": 5, "vinai": 5, "ramasesh": 5, "udai": 5, "prabhu": 5, "vishakh": 5, "padmakumar": 5, "vivek": 5, "srikumar": 5, "fedu": [5, 7], "wout": 5, "vossen": 5, "xiaoyu": 5, "tong": [5, 7], "xinran": 5, "xinyi": 5, "yadollah": 5, "yaghoobzadeh": 5, "yair": 5, "lakretz": 5, "yangqiu": 5, "yasaman": 5, "bahri": 5, "yichi": 5, "yide": 5, "yifu": 5, "yonatan": 5, "belinkov": 5, "yufang": 5, "seid": 5, "zhuoy": 5, "zijian": 5, "ziji": 5, "zirui": 5, "ziyi": 5, "extrapol": 5, "2206": 5, "04615": 5, "wpn": 5, "yada": 5, "pruksachatkun": 5, "amanpreet": 5, "hill": 5, "stickier": 5, "wsm": 5, "1804": 5, "07461": 5, "wtb": 5, "tai": 5, "borgeaud": 5, "dani": 5, "yogatama": 5, "denni": [5, 7], "donald": 5, "metzler": 5, "ed": 5, "oriol": 5, "vinyal": 5, "dean": 5, "07682": 5, "wdr": 5, "doolei": 5, "manlei": 5, "arka": [5, 7], "pal": 5, "feuer": 5, "siddhartha": 5, "ravid": 5, "shwartz": [5, 7], "ziv": 5, "khalid": [5, 6], "saifullah": 5, "siddartha": 5, "naidu": 5, "chinmai": 5, "hegd": 5, "lecun": 5, "goldstein": 5, "willi": 5, "neiswang": 5, "micah": 5, "goldblum": 5, "19314": 5, "yyh": 5, "baosong": [5, 6], "chengpeng": 5, "chengyuan": [5, 6], "fei": [5, 6], "guant": 5, "haoran": [5, 6], "huan": [5, 6], "jialong": 5, "jialin": 5, "jianhong": [5, 6], "tu": [5, 6], "jianwei": [5, 6], "jianxin": [5, 6], "jin": [5, 7], "jingren": [5, 6], "jinz": 5, "jinzheng": 5, "junyang": [5, 6], "keme": [5, 6], "keqin": [5, 6], "kexin": [5, 6], "mingfeng": [5, 6], "xue": [5, 6, 7], "ni": 5, "pei": [5, 6], "ru": 5, "men": [5, 6], "ruiz": 5, "runji": [5, 6], "shiji": 5, "sinan": 5, "tianhang": 5, "wenbin": 5, "ge": 5, "xiaodong": 5, "deng": 5, "xiaohuan": 5, "xingzhang": [5, 6], "xinyu": [5, 7], "xipin": 5, "xuancheng": [5, 6], "yichang": [5, 6], "wan": [5, 6], "yunfei": 5, "yuqiong": [5, 6], "zhenru": [5, 6], "zhihao": 5, "10671": 5, "zcl24": 5, "zhihan": 5, "cao": 5, "lizi": 5, "openreview": 5, "forum": 5, "aegrf1uy0p": 5, "zc": 5, "siyuan": 5, "zhuang": [5, 7], "zhanghao": 5, "yonghao": 5, "zi": 5, "zhuohan": 5, "xing": [5, 7], "2306": 5, "05685": 5, "huggingface24": 5, "metaai24": 5, "di": 6, "hunter": 6, "photo": 6, "email": 6, "hipaa": 6, "properti": [6, 7], "gdpr": 6, "strict": [6, 7, 8], "iot": 6, "unreli": 6, "impract": 6, "slm": 6, "viabl": 6, "sensor": 6, "interconnect": 6, "frontend": 6, "garner": 6, "traction": 6, "yourself": 6, "aw": [6, 7], "bedrock": 6, "sambanova": 6, "sla": 6, "veloc": 6, "roadmap": 6, "commodit": 6, "winner": 6, "loser": 6, "condens": 6, "clean": 6, "2024t": 6, "versatil": 6, "72b": 6, "med": 6, "bloomberggpt": 6, "underw": 6, "adept": 6, "toxigen": 6, "alnajjar": 6, "13b": [6, 7], "01": 6, "outperform": 6, "32b": 6, "feasibl": 6, "2m": 6, "modal": 6, "diagnosi": 6, "patient": 6, "necessit": 6, "deepseek": 6, "flagship": 6, "405b": 6, "gemini": 6, "pack": 6, "v3": [6, 7], "671": 6, "moe": 6, "mixtur": 6, "3x": [6, 7], "v2": [6, 7], "fraction": 6, "mileston": 6, "domin": 6, "cautious": 6, "isol": [6, 7], "cpot": 6, "cpit": 6, "tco": 6, "tpot": 6, "ttft": 6, "gpqa": 6, "ratio": 6, "median": 6, "afford": 6, "lite": 6, "micro": 6, "encod": [6, 7, 8], "cent": 6, "1m": 6, "cheapest": 6, "phi": 6, "half": [6, 7], "permiss": [6, 7], "apach": 6, "microsoft": 6, "simpler": [6, 7, 8], "fewer": [6, 7], "700m": 6, "100m": 6, "gemma": [6, 8], "grown": 6, "withdraw": 6, "incomplet": [6, 7], "preprocess": [6, 8], "unclear": 6, "15t": 6, "8t": 6, "fineweb": 6, "penedo": 6, "96": [6, 7], "crawl": 6, "snapshot": 6, "codebas": 6, "ablat": 6, "vital": [6, 7], "favorit": 6, "spawn": 6, "ultrachat": 6, "2024u": 6, "created_job": 6, "fine_tun": 6, "training_fil": 6, "file_id": 6, "ultrachat_chunk_train": 6, "validation_fil": 6, "ultrachat_chunk_ev": 6, "training_step": 6, "0001": 6, "auto_start": 6, "job_id": 6, "toolkit": [6, 7], "sft": 6, "nemo": [6, 7], "codestr": 6, "2024v": 6, "enough": 6, "despit": [6, 8], "rewrit": 6, "smolvlm": 6, "mlx": [6, 8], "mlc": 6, "peft": 6, "programm": 6, "graphic": [6, 7], "vram": 6, "mathbf": 6, "x_1": [6, 8], "x_2": [6, 8], "x_n": [6, 8], "x_": [6, 8], "\u03b8": 6, "matrix": [6, 7], "cerebra": 6, "mozilla": 6, "docker": 6, "gerganov": 6, "georgi": 6, "hundr": 6, "overwhelm": [6, 8], "manifesto": 6, "enjoy": 6, "bog": 6, "exploratori": 6, "hacker": 6, "Will": [6, 7], "prototyp": 6, "prematur": 6, "besid": 6, "lighter": 6, "sacrific": 6, "unifi": [6, 8], "ggml": [6, 8], "ibm": [6, 7], "metadata": 6, "disk": 6, "backward": 6, "2024x": 6, "repo": 6, "easier": [6, 7, 8], "compil": 6, "linux": 6, "argument": [6, 7, 8], "sudo": 6, "apt": 6, "cmake": 6, "bind": 6, "betlen": 6, "cnv": 6, "llamacpp": 6, "succinct": 6, "ctrl": 6, "interject": 6, "philosoph": 6, "debat": 6, "fulfil": 6, "happi": 6, "responsibli": 6, "bye": 6, "goodby": 6, "port": 6, "127": 6, "curl": [6, 8], "localhost": 6, "v1": [6, 7], "bearer": 6, "finish_reason": 6, "deepli": 6, "1734627879": 6, "completion_token": 6, "total_token": 6, "chatcmpl": 6, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 6, "prompt_n": 6, "prompt_m": 6, "132": 6, "prompt_per_token_m": 6, "prompt_per_second": 6, "77619878666999": 6, "predicted_n": 6, "predicted_m": 6, "1700": 6, "654": [6, 8], "predicted_per_token_m": 6, "36882142857143": 6, "predicted_per_second": 6, "92850867960208": 6, "gbnf": [6, 8], "8pm": 6, "appointmenttim": 6, "appointmentdetail": 6, "handi": 6, "model_path": 6, "llama_cpp": 6, "create_chat_complet": 6, "occupi": 6, "activist": 6, "justin": [6, 7], "tunnei": 6, "ocho": 6, "appach": 6, "cosmopolitan": 6, "libc": 6, "portabl": 6, "durabl": 6, "usabl": [6, 7, 8], "tinyllama": 6, "wget": 6, "jartin": 6, "q5_k_m": 6, "renam": 6, "ex": 6, "chmod": 6, "nobrows": 6, "registri": 6, "nativ": [6, 8], "container": 6, "trai": 6, "familiar": 6, "bare": 6, "ssfl": 6, "sh": [6, 8], "Or": 6, "11434": 6, "chatrespons": 6, "easiest": 6, "rich": [6, 7], "playground": 6, "simultan": [6, 7], "verif": [6, 8], "importantli": [6, 8], "intuit": 6, "beginn": 6, "tensorrt": 6, "trt": 6, "latex": 6, "voic": 6, "pwa": 6, "medium": [6, 7, 8], "gpt4all": 6, "rbac": 6, "q4_k": 6, "q6_k": 6, "mib": 6, "wikitext": 6, "salesforc": 6, "wikipedia": [6, 8], "min_prompt_length": 6, "input_texts_raw": 6, "2010": 6, "valkyria": 6, "chronicl": 6, "forgiv": 6, "newcom": 6, "raita": 6, "honjou": 6, "compos": [6, 7], "hitoshi": 6, "sakimoto": 6, "takeshi": 6, "ozawa": 6, "writer": 6, "theme": [6, 7], "sung": 6, "escap": 6, "escaped_text": 6, "block_scal": 6, "block": [6, 7], "parenthes": 6, "block_min": 6, "formula": 6, "superblock": 6, "5625": 6, "ieee": 6, "754": 6, "ppl": 6, "exp": 6, "sum_": 6, "log_2": 6, "x_i": [6, 8], "avg": 6, "_i": 6, "corr": 6, "ln": [6, 8], "kullback": 6, "leibler": 6, "entropi": 6, "logit": 6, "d_": 6, "softmax": [6, 8], "sum": 6, "kld": 6, "q2_kresult": 6, "q6": 6, "004": 6, "q2": 6, "112": 6, "q4": 6, "smallest": 6, "390": 6, "67": [6, 7], "81": [6, 7], "93": [6, 7], "462": 6, "614": 6, "170": 6, "q4_k_m": 6, "thread": 6, "16x": 6, "85x": 6, "79x": 6, "ubuntu": 6, "lt": 6, "x86_64": 6, "gnu": 6, "thank": [6, 8], "intel": 6, "i7": 6, "8550u": 6, "15gib": 6, "samsung": 6, "ssd": 6, "970": 6, "evo": 6, "500gb": 6, "1170": 6, "meant": 6, "ai24": [6, 7], "blob": [6, 8], "deepseek_v3": 6, "ai4c": 6, "ai4a": 6, "paperswithcod": [6, 7], "ana24a": 6, "leaderboard": [6, 7], "artificialanalysi": 6, "ana24b": 6, "ana24c": 6, "bc24": 6, "andrei": [6, 7], "abetlen": 6, "fac4": 6, "optimum": 6, "concept_guid": 6, "fac4t": 6, "fac4u": 6, "200k": 6, "ultrachat_200k": 6, "fac4v": 6, "blogpost": 6, "gc24": 6, "ggerganov": [6, 8], "readm": [6, 8], "gc4a": 6, "gc4b": 6, "pka": 6, "guilherm": 6, "hynek": 6, "kydl\u00ed\u010dek": 6, "decant": 6, "finest": 6, "17557": 6, "qwe4b": 6, "qy": 6, "beichen": 6, "tingyu": 6, "zihan": 6, "qiu": 6, "15115": 6, "rev24": 6, "harvard": 6, "nyt": 6, "harvardlawreview": 6, "timess": 6, "zwa": 6, "wael": 6, "geoffrei": [6, 7], "angu": 6, "arnav": 6, "jefferi": 6, "kinnison": 6, "sherstinski": 6, "piero": 6, "molino": 6, "travi": 6, "addair": 6, "devvret": 6, "310": 6, "2405": 6, "00732": 6, "huggingface4xa": 6, "huggingface4xb": 6, "ibmthink24": 6, "lmstudio24": 6, "lmstudio": 6, "metaai4c": 6, "mozillaocho24": 6, "salesforce24": 6, "immens": 7, "commonplac": 7, "hartvigsen": 7, "societi": 7, "statement": 7, "alarm": 7, "openli": 7, "dolli": 7, "llama2": [7, 8], "emb": 7, "generalist": 7, "injustic": 7, "inequ": 7, "undermin": 7, "perpetu": 7, "displac": 7, "eros": 7, "fake": 7, "deepfak": 7, "distrust": 7, "cyberattack": 7, "spread": 7, "disinform": 7, "inadvert": 7, "interven": 7, "irrevers": 7, "uncheck": 7, "extinct": 7, "race": 7, "incentiv": 7, "shortcut": 7, "behind": 7, "stress": 7, "urgent": 7, "reorient": 7, "birth": 7, "siam": 7, "edgington": 7, "jailbreak": 7, "promptcraft": 7, "stealth": 7, "sutton": 7, "subtl": 7, "subtleti": 7, "exception": 7, "phrase": 7, "evad": 7, "hqve": 7, "frer": 7, "hplidai": 7, "pl": 7, "hyperion": 7, "coast": 7, "redwood": 7, "tallest": 7, "tree": [7, 8], "routin": 7, "prejudic": 7, "gallego": 7, "leak": 7, "poison": 7, "intention": 7, "inject": 7, "mislead": 7, "exabeam": 7, "finra": 7, "3110": 7, "mandat": 7, "supervisori": 7, "unicef": 7, "empow": 7, "contest": 7, "congress": 7, "enact": 7, "pictur": [7, 8], "sound": 7, "territori": 7, "oversea": 7, "chines": 7, "legitim": 7, "consent": 7, "complaint": 7, "cooper": 7, "extraterritori": 7, "offshor": 7, "draft": 7, "voluntari": 7, "neutral": 7, "player": 7, "prepared": 7, "compris": 7, "cbrn": 7, "persuas": 7, "autonomi": 7, "gradat": 7, "scorecard": 7, "elig": 7, "advisori": 7, "sag": 7, "shut": 7, "prerequisit": 7, "harden": 7, "asl": 7, "biosafeti": 7, "elev": 7, "warn": [7, 8], "bioweapon": 7, "compartment": 7, "difficulti": 7, "4x": 7, "jump": 7, "paus": 7, "deepmind": 7, "biosecur": 7, "buffer": 7, "formul": [7, 8], "calibr": 7, "promin": 7, "taxonomi": 7, "llamaguard": 7, "20241022": 7, "5x": 7, "alaga": 7, "substandard": 7, "oxford": 7, "wachter": 7, "blur": 7, "ill": 7, "stifl": 7, "suscept": 7, "aadc": 7, "outset": 7, "curricula": 7, "adversari": 7, "uncov": [7, 8], "appar": 7, "thoroughli": 7, "lm": [7, 8], "problemat": 7, "undergo": 7, "280b": 7, "cai": [7, 8], "utilis": 7, "minimis": 7, "enshrin": 7, "evas": 7, "resort": 7, "avenu": 7, "cambria": 7, "inherit": 7, "influenti": 7, "debias": 7, "plausibl": 7, "occurr": 7, "phish": 7, "clarifi": 7, "toler": 7, "checklist": 7, "abus": 7, "ux": 7, "architect": 7, "diagram": 7, "retrofit": 7, "promptli": 7, "dashboard": 7, "misalign": 7, "star": 7, "postpon": 7, "combat": 7, "counter": 7, "traffic": 7, "frustrat": 7, "workaround": 7, "silo": 7, "hierarch": 7, "hierarchi": 7, "66": 7, "depth": 7, "mcq": 7, "regex": [7, 8], "joint": 7, "facet": 7, "purpl": 7, "circl": 7, "opensafetylab": 7, "salad_bench_dataset": 7, "base_set": 7, "gptfuzzer": 7, "auto": [7, 8], "qid": 7, "o1": 7, "supremaci": 7, "o53": 7, "o14": 7, "o5": 7, "o65": 7, "plagiar": 7, "o16": 7, "o6": 7, "o47": 7, "campaign": 7, "o12": 7, "o52": 7, "surveil": 7, "spous": 7, "know": [7, 8], "o13": 7, "breakdown": 7, "ncount": 7, "21318": 7, "8756": 7, "6486": 7, "o2": 7, "1717": 7, "o4": 7, "1477": 7, "o3": 7, "socioeconom": 7, "851": 7, "int64": 7, "gen": 7, "15433": 7, "hh": 7, "4184": 7, "659": 7, "advbench": 7, "230": 7, "189": 7, "toxicchat": 7, "anyth": 7, "817": 7, "misconcept": 7, "ingrain": 7, "mc1": 7, "singular": 7, "choices4": 7, "mc2": 7, "set4": 7, "scorer": 7, "correctli": [7, 8], "truthful_qa": 7, "truthfulqa_dataset": 7, "multiple_choic": 7, "best_answ": 7, "correct_answ": 7, "incorrect_answ": 7, "watermelon": 7, "digest": 7, "noth": 7, "stomach": 7, "sick": 7, "wonderopoli": 7, "wonder": 7, "belli": 7, "swallow": 7, "dream": 7, "die": 7, "indigest": 7, "unconsci": 7, "excret": 7, "asr": 7, "r2d2": 7, "wider": [7, 8], "mass": 7, "destruct": 7, "asynchron": 7, "webpurifi": 7, "protectai": 7, "comprehend": 7, "amazon": 7, "nvidia": [7, 8], "keyword": 7, "toolset": 7, "nemmo": 7, "synchron": 7, "nemoguardrail": 7, "llmrail": 7, "railsconfig": 7, "from_path": 7, "rail": 7, "hello": 7, "ministr": 7, "mistralai": 7, "mistral_api_kei": 7, "moderate_chat": 7, "omni": 7, "pprint": 7, "to_json": 7, "threaten": 7, "illicit": 7, "granit": 7, "guardian": 7, "consortium": 7, "11b": 7, "begin_of_text": 7, "start_header_id": 7, "end_header_id": 7, "unsafe_categori": 7, "user_message_1": 7, "model_answer_1": 7, "comma": 7, "eot_id": 7, "eom_id": 7, "denot": 7, "s1": 7, "s2": 7, "s3": 7, "s4": 7, "s5": 7, "defam": 7, "s6": 7, "s7": 7, "s8": 7, "s9": 7, "s10": 7, "s11": 7, "s12": 7, "s13": 7, "atla": 7, "2b": 7, "hap": 7, "38m": 7, "125m": 7, "padhi": 7, "shieldgemma": 7, "judge_prompt": 7, "american": 7, "vandal": 7, "underag": 7, "drink": 7, "vulgar": 7, "obscen": 7, "racism": 7, "derogatori": 7, "firearm": 7, "safety_scor": 7, "IN": 7, "borderlin": 7, "verdict": 7, "boolean": [7, 8], "brief": 7, "rational": 7, "delimit": 7, "paramount": [7, 8], "evenli": 7, "good_sampl": 7, "bad_sampl": 7, "2024z": 7, "surg": 7, "scam": 7, "get_profanity_sampl": 7, "show_stat": 7, "bool": 7, "current_dir": 7, "getcwd": 7, "data_path": 7, "profanity_en": 7, "random_st": 7, "ncategori": 7, "category_count": 7, "category_1": 7, "1f": 7, "profanity_sampl": 7, "nsampl": 7, "anatomi": 7, "slur": 7, "182": 7, "bodili": 7, "fluid": 7, "excrement": 7, "insult": 7, "mental": 7, "disabl": 7, "jap": 7, "babi": [7, 8], "batter": 7, "crazi": 7, "sob": 7, "fukka": 7, "sh1t3": 7, "get_salad_sampl": 7, "salad_sampl": 7, "tortur": 7, "porn": 7, "sin": 7, "sight": 7, "god": 7, "embezzl": 7, "xanax": 7, "alcohol": 7, "get_good_sampl": 7, "min_scor": 7, "reichstag": 7, "profanity_data": 7, "salad_data": 7, "good_data": 7, "all_data": 7, "prompt_sampl": 7, "is_unsaf": 7, "counti": 7, "holli": 7, "ridg": 7, "nc": 7, "town": 7, "onslow": 7, "carolina": 7, "diver": 7, "underwat": 7, "maze": 7, "coral": 7, "treasur": 7, "vivid": 7, "sensori": 7, "emot": 7, "labyrinthin": 7, "passag": 7, "reef": 7, "suspens": 7, "obstacl": 7, "creatur": 7, "nomin": 7, "nobel": 7, "literatur": 7, "love": 7, "ny": [7, 8], "logo": 7, "thief": 7, "rob": 7, "famou": 7, "nstatist": 7, "source_stat": 7, "type_stat": 7, "tiktoken": 7, "plug": 7, "safetyvalid": 7, "validationresult": 7, "dataclass": 7, "abstractmethod": 7, "llmguardvalid": 7, "scanner": 7, "bantop": 7, "llm_guard": 7, "input_scann": 7, "scan_prompt": 7, "matchtyp": 7, "default_banned_top": 7, "banned_top": 7, "super": 7, "banned_topics_scann": 7, "use_onnx": 7, "toxicity_scann": 7, "match_typ": 7, "fail_fast": 7, "unsafe_scann": 7, "gun": 7, "cool": 7, "hunt": 7, "deer": 7, "dad": 7, "mistralvalid": 7, "hate_and_discrimin": 7, "violence_and_threat": 7, "dangerous_and_criminal_cont": 7, "selfharm": 7, "openaivalid": 7, "attr": 7, "dir": 7, "getattr": 7, "illicit_viol": 7, "llmjudgevalid": 7, "prompt_path": 7, "llmjudg": 7, "filenotfounderror": 7, "elaps": 7, "score_valid": 7, "id_": 7, "validator_inst": 7, "start_tim": 7, "validation_result": 7, "elapsed_tim": 7, "prompt_sample_id": 7, "validator_nam": 7, "scoring_prompt": 7, "scoring_result": 7, "01536": 7, "34098": 7, "497136": 7, "546416": 7, "calculate_validator_metr": 7, "bad_sourc": 7, "good_sourc": 7, "tpr": 7, "fpr": 7, "f1_score": 7, "tn": 7, "fp": 7, "fn": 7, "tp": 7, "255": 7, "74": 7, "238": 7, "130": 7, "256": 7, "79": 7, "149": 7, "213": 7, "190": 7, "235": 7, "63": 7, "222": 7, "480": 7, "157": 7, "487": 7, "495": 7, "482": 7, "667": 7, "248": 7, "466": 7, "143": 7, "355": 7, "slowest": 7, "room": 7, "false_posit": 7, "left_on": 7, "right_on": 7, "persian": 7, "teenag": 7, "\u0686\u0627\u0644\u0634": 7, "\u0645\u0648\u0645\u0648": 7, "\u06cc\u06a9": 7, "\u062d\u0642\u0647": 7, "\u0648": 7, "\u0627\u0641\u0633\u0627\u0646\u0647": 7, "\u0645\u062d\u0644\u06cc": 7, "\u0627\u0633\u062a": 7, "\u06a9\u0647": 7, "\u0634\u0627\u06cc\u0639\u0647": 7, "\u0622\u0646": 7, "\u062f\u0631": 7, "\u0634\u0628\u06a9\u0647": 7, "\u0647\u0627\u06cc": 7, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 7, "\u067e\u062e\u0634": 7, "\u0634\u062f": 7, "\u06af\u0632\u0627\u0631\u0634": 7, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 7, "\u0628\u0627": 7, "\u0646\u0627\u0645": 7, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 7, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 7, "\u0631\u0627": 7, "\u0628\u0647": 7, "\u0627\u0639\u0645\u0627\u0644": 7, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 7, "\u0645\u0627\u0646\u0646\u062f": 7, "\u062e\u0648\u062f\u0632\u0646\u06cc": 7, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 7, "\u062a\u0631\u063a\u06cc\u0628": 7, "\u0645\u06cc": 7, "\u06a9\u0646\u062f": 7, "\u0648\u062c\u0648\u062f": 7, "\u0635\u062d\u0628\u062a": 7, "\u0632\u06cc\u0627\u062f\u06cc": 7, "\u0645\u0648\u0631\u062f": 7, "\u0627\u06cc\u0646": 7, "\u062a\u0639\u062f\u0627\u062f": 7, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 7, "\u0648\u0627\u0642\u0639\u06cc": 7, "\u0628\u0633\u06cc\u0627\u0631": 7, "\u06a9\u0645": 7, "\u0628\u0648\u062f": 7, "\u067e\u0644\u06cc\u0633": 7, "\u0635\u062f\u0645\u0647": 7, "\u062f\u06cc\u062f\u0646": 7, "\u062f\u0644\u06cc\u0644": 7, "\u062a\u0623\u062b\u06cc\u0631": 7, "\u0645\u0633\u062a\u0642\u06cc\u0645": 7, "\u067e\u062f\u06cc\u062f\u0647": 7, "\u062a\u0623\u06cc\u06cc\u062f": 7, "\u0646\u06a9\u0631\u062f\u0647": 7, "\u062a\u0631\u0633": 7, "\u0646\u06af\u0631\u0627\u0646\u06cc": 7, "\u0627\u06cc\u062c\u0627\u062f": 7, "\u0634\u062f\u0647": 7, "\u0628\u06cc\u0634\u062a\u0631": 7, "\u0627\u0632": 7, "\u062e\u0648\u062f": 7, "\u0631\u0633\u0627\u0646\u0647": 7, "\u0647\u0627": 7, "\u0637\u0648\u0631\u06cc": 7, "\u062e\u06cc\u0631\u06cc\u0647": 7, "\u0647\u0634\u062f\u0627\u0631": 7, "\u062f\u0627\u062f\u0646\u062f": 7, "\u0622\u0633\u06cc\u0628": 7, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 7, "\u0645\u062d\u062a\u0648\u0627\u06cc": 7, "\u062e\u0634\u0648\u0646\u062a": 7, "\u0622\u0645\u06cc\u0632": 7, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 7, "\u06af\u0641\u062a\u0647": 7, "\u0634\u0648\u062f": 7, "\u0627\u0648\u0644\u06cc\u0646": 7, "\u0628\u0627\u0631": 7, "\u0633\u0627\u0644": 7, "\u06f2\u06f0\u06f1\u06f8": 7, "\u067e\u0633": 7, "\u0622\u0646\u06a9\u0647": 7, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 7, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 7, "\u062e\u0628\u0631": 7, "\u062f\u062e\u062a\u0631": 7, "\u06f1\u06f2": 7, "\u0633\u0627\u0644\u0647": 7, "\u062f\u0627\u062f": 7, "\u0645\u0648\u0636\u0648\u0639": 7, "\u062c\u0647\u0627\u0646\u06cc": 7, "\u062a\u0628\u062f\u06cc\u0644": 7, "\u0645\u062c\u0633\u0645\u0647": 7, "\u0647\u0646\u0631\u0645\u0646\u062f": 7, "\u0698\u0627\u067e\u0646\u06cc": 7, "\u0647\u0631": 7, "\u0686\u0646\u062f": 7, "\u0634\u0627\u06cc\u062f": 7, "\u0646\u06af\u0627\u0647": 7, "\u0628\u0639\u0636\u06cc": 7, "\u0632\u06cc\u0628\u0627": 7, "\u0646\u0628\u0627\u0634\u062f": 7, "\u0627\u0645\u0627": 7, "\u06a9\u0627\u0645\u0644\u0627": 7, "\u0628\u06cc": 7, "\u062e\u0637\u0631": 7, "\u0627\u06cc\u0631\u0627\u0646": 7, "\u0645\u062f\u062a": 7, "\u0628\u06cc\u0646": 7, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 7, "\u0645\u0637\u0631\u062d": 7, "\u0633\u0627\u0644\u06cc": 7, "\u0633\u0631\u0627\u0633\u0631": 7, "\u062c\u0647\u0627\u0646": 7, "\u0645\u0634\u0627\u0628\u0647\u06cc": 7, "\u0628\u0631\u0627\u06cc": 7, "\u0648\u0627\u0644\u062f\u06cc\u0646": 7, "\u06a9\u0631\u062f\u0647": 7, "\u0627\u0641\u0631\u0627\u062f": 7, "\u0686\u0647": 7, "\u06a9\u0627\u0631\u06cc": 7, "\u062f\u0639\u0648\u062a": 7, "tourist": 7, "distress": 7, "polish": 7, "galician": 7, "dzisiaj": 7, "szwecji": 7, "innych": 7, "bogatych": 7, "krajach": 7, "ludzi": 7, "u\u017cywaj\u0105": 7, "mn\u00f3stwo": 7, "najr\u00f3\u017cniejszych": 7, "urz\u0105dze\u0144": 7, "hox": 7, "suecia": 7, "outro": 7, "pa\u00eds": 7, "rico": 7, "xent": 7, "usa": [7, 8], "moita": 7, "m\u00e1quina": 7, "diferent": 7, "\u0142\u00f3dka": 7, "zaczyna": 7, "ton\u0105\u0107": 7, "tury\u015bci": 7, "wracaj\u0105": 7, "statek": 7, "dom\u00f3w": 7, "gdzie": 7, "opowiadaj\u0105": 7, "tym": 7, "jak": 7, "zostali": 7, "zaatakowani": 7, "surprisingli": 7, "unsettl": 7, "paradox": 7, "harbor": 7, "wisdom": 7, "aspir": 7, "technologist": 7, "disciplinari": 7, "ethicist": 7, "policymak": 7, "asa24": 7, "jide": 7, "jona": 7, "schuett": 7, "marku": 7, "anderljung": 7, "08751": 7, "bhy": 7, "hinton": 7, "pieter": 7, "abbeel": 7, "trevor": 7, "darrel": 7, "yuval": 7, "harari": 7, "ya": 7, "lan": 7, "shai": 7, "shalev": 7, "gillian": 7, "hadfield": 7, "clune": 7, "tegan": 7, "maharaj": 7, "hutter": 7, "at\u0131l\u0131m": 7, "g\u00fcne\u015f": 7, "baydin": 7, "sheila": 7, "mcilraith": 7, "qiqi": 7, "ashwin": 7, "acharya": 7, "anca": 7, "dragan": 7, "philip": 7, "torr": 7, "russel": 7, "kahneman": 7, "s\u00f6ren": 7, "mindermann": 7, "amid": 7, "384": 7, "6698": 7, "1126": 7, "adn0117": 7, "bbc": 7, "emili": 7, "braca": 7, "israel": 7, "carter": 7, "hafsa": 7, "kanchwala": 7, "khojasteh": 7, "charli": 7, "landow": 7, "luo": 7, "magarelli": 7, "mirin": 7, "averi": 7, "moyer": 7, "kayla": 7, "simpson": 7, "amelia": 7, "skawinski": 7, "heverin": 7, "23308": 7, "bmc": 7, "dillon": 7, "brendan": 7, "murphi": 7, "khachaturov": 7, "gleav": 7, "kellin": 7, "pelrin": 7, "2408": [7, 8], "02946": 7, "cmm": 7, "erik": 7, "lorenzo": 7, "malandri": 7, "fabio": 7, "mercorio": 7, "navid": 7, "nobani": 7, "seveso": 7, "15248": 7, "edg24": 7, "exa24": 7, "cyber": 7, "grb": 7, "rossi": 7, "barrow": 7, "mehrab": 7, "tanjim": 7, "sungchul": 7, "franck": 7, "dernoncourt": 7, "ruiyi": 7, "nesreen": 7, "2309": 7, "00770": 7, "h44z": 7, "hgp": 7, "saadia": 7, "hamid": 7, "palangi": 7, "dipankar": 7, "ec": 7, "kamar": 7, "oxi": 7, "smaranda": 7, "muresan": 7, "preslav": 7, "nakov": 7, "alin": 7, "villavicencio": 7, "editor": 7, "60th": 7, "3309": 7, "3326": 7, "dublin": 7, "aclanthologi": 7, "acl": 7, "18653": 7, "hym": 7, "weijiang": 7, "weitao": 7, "weihong": 7, "zhangyin": 7, "haotian": 7, "qianglong": 7, "weihua": 7, "xiaocheng": 7, "bing": 7, "dx": 7, "1145": [7, 8], "3703155": 7, "ldw": 7, "lijun": 7, "ruohui": 7, "xuhao": 7, "wangmeng": 7, "zuo": 7, "dahua": 7, "qiao": 7, "shao": 7, "05044": 7, "mpy": 7, "xuwang": 7, "zifan": 7, "norman": 7, "mu": 7, "elham": 7, "sakhae": 7, "nathaniel": 7, "forsyth": 7, "04249": 7, "mlc24": 7, "illumin": 7, "ailumin": 7, "oaa": 7, "adler": 7, "ahmad": 7, "ilg": 7, "akkaya": 7, "florencia": 7, "leoni": 7, "aleman": 7, "janko": 7, "altenschmidt": 7, "altman": 7, "shyamal": 7, "anadkat": 7, "avila": 7, "valeri": 7, "balcom": 7, "baltescu": 7, "haim": 7, "belgum": 7, "irwan": 7, "bello": 7, "jake": 7, "berdin": 7, "bernadett": 7, "shapiro": 7, "berner": 7, "lenni": 7, "bogdonoff": 7, "boiko": 7, "madelain": 7, "boyd": 7, "luisa": 7, "brakman": 7, "button": 7, "rosi": 7, "campbel": 7, "cann": 7, "brittani": 7, "carei": 7, "carlson": 7, "rori": 7, "carmichael": 7, "che": 7, "foti": 7, "sulli": 7, "rubi": 7, "chess": 7, "chester": 7, "cho": 7, "hyung": 7, "won": 7, "chung": 7, "jeremiah": 7, "currier": 7, "yunx": 7, "cori": 7, "decareaux": 7, "degri": 7, "deutsch": 7, "devil": 7, "dhar": 7, "steve": 7, "dowl": 7, "dun": 7, "adrien": 7, "ecoffet": 7, "atti": 7, "eleti": 7, "tyna": 7, "elound": 7, "farhi": 7, "niko": 7, "sim\u00f3n": 7, "posada": 7, "fishman": 7, "juston": 7, "isabella": 7, "fulford": 7, "georg": 7, "gibson": 7, "vik": 7, "tarun": 7, "gogineni": 7, "goh": 7, "rapha": 7, "gontijo": 7, "lope": 7, "gordon": 7, "morgan": 7, "grafstein": 7, "yufei": 7, "guo": 7, "hallaci": 7, "heaton": 7, "johann": 7, "heideck": 7, "hickei": 7, "wade": 7, "hoeschel": 7, "houghton": 7, "kenni": 7, "hsu": 7, "shengli": 7, "joost": 7, "huizinga": 7, "shawn": 7, "joann": 7, "jang": 7, "roger": 7, "haozhun": 7, "shino": 7, "jomoto": 7, "billi": 7, "jonn": 7, "tomer": 7, "kaftan": 7, "\u0142ukasz": 7, "kamali": 7, "ingmar": 7, "kanitscheid": 7, "tabarak": 7, "khan": 7, "logan": 7, "kilpatrick": 7, "jong": 7, "wook": 7, "christina": 7, "yongjik": 7, "hendrik": 7, "kirchner": 7, "kiro": 7, "matt": 7, "kokotajlo": 7, "kondraciuk": 7, "kondrich": 7, "konstantinidi": 7, "kosic": 7, "vishal": 7, "kuo": 7, "lamp": 7, "ikai": 7, "teddi": 7, "jade": 7, "leung": 7, "chak": 7, "ming": 7, "lim": 7, "molli": 7, "mateusz": 7, "litwin": 7, "theresa": 7, "lopez": 7, "patricia": 7, "lue": 7, "makanju": 7, "malfacini": 7, "markov": 7, "yaniv": 7, "markovski": 7, "bianca": 7, "mayn": 7, "mckinnei": 7, "christin": 7, "mcleavei": 7, "mcmillan": 7, "mcneil": 7, "aalok": 7, "menick": 7, "mishchenko": 7, "vinni": 7, "monaco": 7, "murk": 7, "m\u00e9ly": 7, "ashvin": 7, "nair": 7, "reiichiro": 7, "nakano": 7, "rajeev": 7, "nayak": 7, "arvind": 7, "neelakantan": 7, "hyeonwoo": 7, "noh": 7, "keef": 7, "jakub": 7, "pachocki": 7, "palermo": 7, "ashlei": 7, "pantuliano": 7, "parish": 7, "emi": 7, "parparita": 7, "passo": 7, "perelman": 7, "belbut": 7, "pere": 7, "pokorni": 7, "pokrass": 7, "vitchyr": 7, "pong": 7, "tolli": 7, "powel": 7, "bori": 7, "proehl": 7, "rae": 7, "ramesh": 7, "franci": 7, "kendra": 7, "rimbach": 7, "carl": 7, "rotst": 7, "roussez": 7, "saltarelli": 7, "ted": 7, "sander": 7, "schnurr": 7, "selsam": 7, "kyla": 7, "sheppard": 7, "toki": 7, "sherbakov": 7, "shieh": 7, "shoker": 7, "pranav": 7, "szymon": 7, "sidor": 7, "sigler": 7, "sitkin": 7, "sokolowski": 7, "natali": 7, "staudach": 7, "madelein": 7, "tootoonchian": 7, "tseng": 7, "preston": 7, "tuggl": 7, "turlei": 7, "juan": 7, "cer\u00f3n": 7, "urib": 7, "vallon": 7, "vijayvergiya": 7, "jai": 7, "alvin": 7, "ward": 7, "cj": 7, "weinmann": 7, "akila": 7, "welihinda": 7, "jiayi": 7, "weng": 7, "lilian": 7, "wiethoff": 7, "willner": 7, "wolrich": 7, "lauren": 7, "workman": 7, "sherwin": 7, "yoo": 7, "zeller": 7, "shengjia": 7, "juntang": 7, "zhuk": 7, "2303": 7, "08774": 7, "pnc": 7, "inkit": 7, "manish": 7, "nagireddi": 7, "giandomenico": 7, "cornacchia": 7, "subhajit": 7, "chaudhuri": 7, "tejaswini": 7, "pedapati": 7, "pierr": 7, "dognin": 7, "keerthiram": 7, "murugesan": 7, "miehl": 7, "santill\u00e1n": 7, "kieran": 7, "giulio": 7, "zizzo": 7, "muhammad": 7, "zaid": 7, "hame": 7, "purcel": 7, "desmond": 7, "pan": 7, "ing": 7, "vejsbjerg": 7, "dali": 7, "hind": 7, "werner": 7, "geyer": 7, "ambrish": 7, "rawat": 7, "kush": 7, "varshnei": 7, "prasanna": 7, "sattigeri": 7, "07724": 7, "saffron": 7, "ring": 7, "aslanid": 7, "glaes": 7, "nat": 7, "mcalees": 7, "irv": 7, "2202": 7, "03286": 7, "szw": 7, "qinghua": 7, "higham": 7, "gorban": 7, "bastouni": 7, "ivan": 7, "tyukin": 7, "12670": 7, "vsk": 7, "simplesafetytest": 7, "2311": 7, "08370": 7, "wmr24": 7, "sandra": 7, "brent": 7, "mittelstadt": 7, "duti": 7, "royal": 7, "240197": 7, "royalsocietypublish": 7, "1098": 7, "rso": 7, "ylx24": 7, "jiahao": 7, "xingwei": 7, "zyi": 7, "shune": 7, "lyumanshan": 7, "jingyu": 7, "shui": 7, "haobin": 7, "pengfei": 7, "hewu": 7, "ghost": 7, "14931": 7, "zho24": 7, "amazonwservices24": 7, "anthropic24": 7, "cdn": 7, "1adf000c8f675958c2ee23805d91aaade1cd4613": 7, "centerfasafety24a": 7, "centerforaisafeti": 7, "centerfasafety24b": 7, "deepmind24": 7, "googleapi": 7, "fsf": 7, "europeanmagency24": 7, "ema": 7, "europa": 7, "activities_en": 7, "financialirauthority24": 7, "ibm24": 7, "watsonx": 7, "saa": 7, "libraryocongress23": 7, "loc": 7, "gov": 7, "mistralai24": 7, "mlsteam24": 7, "mlsafeti": 7, "nationaliosatechnology24": 7, "nist": 7, "itl": 7, "nvidia24": 7, "openai24a": 7, "openai24b": 7, "opensafetylab24a": 7, "opensafetylab24b": 7, "protectai24": 7, "surgeai24": 7, "ukgovernment24": 7, "unicef24": 7, "innocenti": 7, "julia": 8, "easili": 8, "trial": 8, "wrangl": 8, "hoc": 8, "dataset": 8, "unwant": 8, "overflow": 8, "twitter": 8, "youtub": 8, "ldot": 8, "prod_": 8, "syntact": 8, "central": 8, "delic": 8, "heart": 8, "xml": 8, "invalid": 8, "ttt": 8, "itt": 8, "nousresearch": 8, "herm": 8, "json_format": 8, "person1": 8, "q1": 8, "person2": 8, "response_cont": 8, "is_json": 8, "myjson": 8, "nest": 8, "conceptu": 8, "overview": 8, "unend": 8, "whitespac": 8, "throw": 8, "somewher": 8, "json_object": 8, "circul": 8, "vertex": 8, "worri": 8, "enum": 8, "secextract": 8, "mentioned_ent": 8, "mentioned_plac": 8, "extract_from_sec_fil": 8, "sec_filing_text": 8, "parser": 8, "hint": 8, "prompt_extract": 8, "sec_extract": 8, "washington": 8, "beg": 8, "1652": 8, "171": 8, "unnorm": 8, "0325": 8, "strongest": 8, "bfloat16": 8, "device_map": 8, "src": 8, "python3": 8, "nvml": 8, "return_tensor": 8, "pt": 8, "inference_mod": 8, "last_token_logit": 8, "next_token_prob": 8, "nn": 8, "dim": 8, "top_k_prob": 8, "top_k_indic": 8, "topk": 8, "top_k_token": 8, "decod": 8, "idx": 8, "skip_special_token": 8, "prob": 8, "0305": 8, "0197": 8, "0106": 8, "0093": 8, "logitsprocessor": 8, "logits_processor": 8, "logitsprocessorlist": 8, "customlogitsprocessor": 8, "intermediari": 8, "input_id": 8, "__call__": 8, "longtensor": 8, "batch_siz": 8, "sequence_length": 8, "floattensor": 8, "vocab_s": 8, "mask": 8, "pick": 8, "greedi": 8, "yesnologitsprocessor": 8, "initial_length": 8, "fill_": 8, "inf": 8, "debug": 8, "yes_token": 8, "add_special_token": 8, "no_token": 8, "yes_no_logit": 8, "yes_no_prob": 8, "yes_prob": 8, "no_prob": 8, "yes_mask": 8, "1e4": 8, "NO": 8, "generation_output_control": 8, "uncontrol": 8, "generation_output": 8, "renorm": 8, "4263": 8, "5737": 8, "10407": 8, "4607": 8, "6250": 8, "9219": 8, "helper": 8, "model_output": 8, "gen_output": 8, "batch_decod": 8, "clean_up_tokenization_spac": 8, "classic": 8, "italian": 8, "willard": 8, "louf": 8, "reformul": 8, "finit": 8, "fsm": 8, "s_": 8, "s_t": 8, "s_1": 8, "tild": 8, "odot": 8, "rightarrow": 8, "wise": 8, "thien": 8, "automaton": 8, "dfa": 8, "outgo": 8, "yy": 8, "ever": 8, "aa": 8, "lwai": 8, "prop": 8, "yynnaa": 8, "malform": 8, "base_prompt": 8, "sec_extraction_outlin": 8, "zsp": 8, "zicorp": 8, "with_structured_output": 8, "runnabl": 8, "typeddict": 8, "qu": 8, "langchain_openai": 8, "chatopenai": 8, "langchain_cor": 8, "chatprompttempl": 8, "extract_from_sec_filing_langchain": 8, "structured_llm": 8, "prompt_templ": 8, "from_messag": 8, "llm_chain": 8, "invok": 8, "sec_extraction_langchain": 8, "bnf": 8, "backu": 8, "naur": 8, "fssl": 8, "extract_entities_from_sec_fil": 8, "ollama_structured_output_prompt_suffix": 8, "ollama_structured_output_temperatur": 8, "uncensor": 8, "model_json_schema": 8, "response_json": 8, "sharpli": 8, "wrapper": 8, "exllama2": 8, "zoo": 8, "furthermor": 8, "nonetheless": 8, "extran": 8, "dispar": 8, "preval": 8, "speak": 8, "aider": 8, "outweigh": 8, "rebutt": 8, "reproduct": 8, "paint": 8, "dottxt": 8, "flaw": 8, "uneven": 8, "conflat": 8, "drawback": 8, "pfiffer": 8, "wrestl": 8, "aid24": 8, "dot24": 8, "demo": 8, "gge24": 8, "lan4b": 8, "lww": 8, "xun": 8, "hanyu": 8, "yezhaohui": 8, "shichao": 8, "simin": 8, "shunyu": 8, "feiyu": 8, "xiong": 8, "12599": 8, "llf": 8, "xieyang": 8, "frederick": 8, "fiannaca": 8, "terri": 8, "koo": 8, "dixon": 8, "ea": 8, "machineri": 8, "3613905": 8, "3650756": 8, "xuan": 8, "hai": 8, "nguyen": 8, "ngoc": 8, "tiviati": 8, "hieu": 8, "dao": 8, "shafiq": 8, "joti": 8, "kenji": 8, "kawaguchi": 8, "nanci": 8, "min": 8, "kan": 8, "08656": 8, "out24": 8, "twt": 8, "zhi": 8, "cheng": 8, "kuang": 8, "tsai": 8, "chieh": 8, "hung": 8, "yun": 8, "nung": 8, "02442": 8, "tt24": 8, "vivien": 8, "vivien000": 8, "wl23": 8, "r\u00e9mi": 8, "09702": 8, "guidanceai24": 8, "nvidia4a": 8, "wikipediacontributors24": 8, "wiktionari": 8, "naur_form": 8}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7, 8], "core": 0, "challeng": 0, "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3, 4], "practic": [0, 2, 6, 8], "approach": [0, 4, 7], "an": 0, "open": [0, 2, 6], "sourc": [0, 2, 6], "note": [0, 3], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": [0, 6], "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 7], "kei": [0, 5], "configur": 0, "troubleshoot": 0, "common": [0, 7], "issu": 0, "author": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 6, 7], "guid": 2, "pitfal": [2, 7], "softwar": [2, 5], "chapter": 2, "1": [2, 7], "The": [2, 4, 5, 6], "eval": [2, 5, 7], "gap": [2, 5], "2": [2, 6, 7], "manag": 2, "input": 2, "data": [2, 3], "3": [2, 7], "structur": [2, 8], "output": [2, 8], "4": [2, 7], "safeti": [2, 7], "5": [2, 7], "prefer": [2, 3], "base": [2, 3, 5, 7], "align": [2, 3], "6": [2, 7], "local": [2, 6], "7": 2, "fall": [2, 4], "cost": [2, 4, 6], "paradox": [2, 4], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 5, 6, 7, 8], "resourc": 2, "introduct": [3, 5, 6, 7, 8], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 5, 6], "human": 3, "supervis": 3, "fine": [3, 6, 8], "tune": [3, 6, 8], "sft": 3, "augment": 3, "post": [3, 8], "train": 3, "answer": 3, "limit": 3, "collaps": 3, "fake": 3, "case": [3, 6, 7], "studi": [3, 6, 7], "polici": [3, 7], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 5, 6, 7], "synthet": 3, "gener": [3, 5, 7], "user": [3, 7], "prompt": [3, 6, 8], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": [3, 4], "prepar": 3, "vibe": 3, "check": [3, 4], "evalu": [3, 5, 7], "discuss": [3, 8], "conclus": [3, 4, 5, 6, 7, 8], "citat": [3, 5, 7, 8], "refer": [3, 4, 5, 6, 7, 8], "why": 4, "matter": 4, "more": 4, "than": 4, "ever": 4, "right": 4, "size": 4, "strateg": 4, "metric": [4, 5], "requir": [4, 5], "busi": 4, "perform": [4, 6], "oper": 4, "technic": [4, 7], "quantiz": [4, 6], "list": 4, "non": 5, "determinist": 5, "machin": 5, "emerg": 5, "properti": 5, "problem": [5, 8], "statement": [5, 8], "tradit": 5, "v": [5, 6], "design": [5, 7], "applic": 5, "test": 5, "matrix": 5, "conceptu": 5, "overview": 5, "consider": 5, "task": [5, 6], "benchmark": [5, 6, 7], "leaderboard": 5, "lightev": 5, "mmlu": 5, "econometr": 5, "sampl": [5, 7], "famili": [5, 6], "us": 5, "langsmith": 5, "promptfoo": 5, "comparison": [5, 6, 8], "choos": 6, "suitabl": 6, "result": 6, "llama": 6, "licens": 6, "commun": 6, "support": 6, "custom": [6, 7], "mistral": [6, 7], "decemb": 6, "22": 6, "2024": 6, "deploy": 6, "serv": 6, "cpp": 6, "llamafil": 6, "ollama": [6, 8], "lama": 6, "ui": 6, "lm": 6, "studio": 6, "jan": 6, "webui": 6, "openwebui": 6, "effect": 6, "level": 6, "hardwar": 6, "takeawai": [6, 7], "risk": 7, "ai": 7, "amplifi": 7, "exist": 7, "harm": 7, "novel": 7, "associ": 7, "autonom": 7, "exacerb": 7, "factor": 7, "specif": 7, "guidanc": 7, "govern": 7, "organ": 7, "privat": 7, "sector": 7, "openai": 7, "anthrop": 7, "googl": 7, "rubric": 7, "mlcommon": 7, "centr": 7, "porquoi": 7, "red": 7, "team": 7, "constitut": 7, "explain": 7, "xai": 7, "plan": 7, "phase": 7, "definit": 7, "research": [7, 8], "identif": 7, "framework": [7, 8], "architectur": 7, "implement": 7, "select": 7, "go": 7, "market": 7, "compon": 7, "salad": 7, "bench": 7, "truthfulqa": 7, "harmbench": 7, "safebench": 7, "techniqu": [7, 8], "repres": 7, "layer": 7, "map": 7, "rule": 7, "filter": 7, "moder": 7, "bad": 7, "good": 7, "guard": 7, "judg": 7, "valid": 7, "engin": 8, "json": 8, "mode": 8, "logit": 8, "process": 8, "outlin": 8, "langchain": 8, "best": 8, "compar": 8, "solut": 8, "ongo": 8, "debat": 8, "acknowledg": 8}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"], [8, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author": [[0, "about-the-author"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Managing Input Data": [[2, "chapter-2-managing-input-data"]], "Chapter 3: Structured Output": [[2, "chapter-3-structured-output"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Falling Cost Paradox": [[2, "chapter-7-the-falling-cost-paradox"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"], [8, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "Citation": [[3, "citation"], [5, "citation"], [7, "citation"], [8, "citation"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"], [8, "references"]], "The Falling Cost Paradox": [[4, "the-falling-cost-paradox"]], "Why Optimization Matters More Than Ever": [[4, "why-optimization-matters-more-than-ever"]], "Right-Sizing LLMs: A Strategic Approach": [[4, "right-sizing-llms-a-strategic-approach"]], "Metrics": [[4, "metrics"], [5, "metrics"]], "Requirements": [[4, "requirements"]], "Business Requirements": [[4, "business-requirements"]], "Performance Requirements": [[4, "performance-requirements"]], "Operational Requirements": [[4, "operational-requirements"]], "Technical Requirements": [[4, "technical-requirements"]], "Quantization": [[4, "quantization"], [6, "quantization"]], "Check-list": [[4, "check-list"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"], [8, "conclusion"]], "The Evals Gap": [[5, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[5, "non-deterministic-generative-machines"]], "Emerging Properties": [[5, "emerging-properties"]], "Problem Statement": [[5, "problem-statement"], [8, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[5, "evals-table"]], "Evals Design": [[5, "evals-design"]], "LLM Application Testing Requirements Matrix": [[5, "validation-requirements"]], "Conceptual Overview": [[5, "conceptual-overview"]], "Design Considerations": [[5, "design-considerations"]], "Key Metrics for Evaluating Generative Tasks": [[5, "key-metrics"]], "Evaluators": [[5, "evaluators"]], "Model-Based Evaluation": [[5, "model-based-evaluation"]], "Evaluating Evaluators": [[5, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[5, "benchmarks-and-leaderboards"]], "Tools": [[5, "tools"], [8, "tools"]], "LightEval": [[5, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[5, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[5, "model-families"]], "LangSmith": [[5, "langsmith"]], "PromptFoo": [[5, "promptfoo"]], "Comparison": [[5, "comparison"], [6, "comparison"], [6, "id37"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[5, "tool-comparison"]], "Local LLMs in Practice": [[6, "local-llms-in-practice"]], "Choosing your Model": [[6, "choosing-your-model"]], "Task Suitability": [[6, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[6, "llama2-benchmark"]], "Performance & Cost": [[6, "performance-cost"]], "Licensing": [[6, "licensing"]], "Open Source LLMs.": [[6, "open-source-llms"]], "Community Support": [[6, "community-support"]], "Customization": [[6, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[6, "mistral-costs"]], "Tools for Local LLM Deployment": [[6, "tools-for-local-llm-deployment"]], "Serving Models": [[6, "serving-models"]], "LLama.cpp": [[6, "llama-cpp"]], "Llamafile": [[6, "llamafile"]], "Ollama": [[6, "ollama"], [8, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[6, "feature-comparison-local"]], "UI": [[6, "ui"]], "LM Studio": [[6, "lm-studio"]], "Jan": [[6, "jan"]], "Open WebUI": [[6, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[6, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[6, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[6, "prompts-dataset"]], "Quantization Levels": [[6, "quantization-levels"]], "Benchmarking": [[6, "benchmarking"], [7, "benchmarking"]], "Results": [[6, "results"]], "Quantization Benchmarks": [[6, "quantization-benchmarks"]], "Benchmarking Hardware": [[6, "benchmarking-hardware"]], "Takeaways": [[6, "takeaways"], [7, "takeaways"]], "Safety": [[7, "safety"]], "Safety Risks": [[7, "safety-risks"]], "General AI Safety Risks": [[7, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[7, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[7, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[7, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[7, "llms-specific-safety-risks"]], "Guidance": [[7, "guidance"]], "Governments & Organizations": [[7, "governments-organizations"]], "Private Sector": [[7, "private-sector"]], "OpenAI": [[7, "openai"]], "Anthropic": [[7, "anthropic"]], "Google": [[7, "google"]], "Rubrics": [[7, "rubrics"]], "MLCommons AI Safety Benchmark": [[7, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[7, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[7, "porquoi"]], "Approaches": [[7, "approaches"]], "Red Teaming": [[7, "red-teaming"]], "Constitutional AI": [[7, "constitutional-ai"]], "Explainable AI (XAI)": [[7, "explainable-ai-xai"]], "Designing a Safety Plan": [[7, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[7, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[7, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[7, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[7, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[7, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[7, "phase-6-go-to-market"]], "Common Pitfalls": [[7, "common-pitfalls"]], "Technical Implementation Components": [[7, "technical-implementation-components"]], "Benchmarks & Datasets": [[7, "benchmarks-datasets"]], "SALAD-Bench": [[7, "salad-bench"]], "TruthfulQA": [[7, "truthfulqa"]], "HarmBench": [[7, "harmbench"]], "SafeBench": [[7, "safebench"]], "Tools & Techniques": [[7, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[7, "safety-layer-table"]], "Rules-Based Safety Filtering": [[7, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[7, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[7, "llm-based-safety-filtering"]], "Custom Moderation": [[7, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[7, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[7, "evals-dataset"]], "Bad Samples": [[7, "bad-samples"]], "Good Samples": [[7, "good-samples"]], "Safety Filters": [[7, "safety-filters"]], "LLM-Guard": [[7, "llm-guard"]], "Mistral Moderation API": [[7, "mistral-moderation-api"]], "OpenAI Moderation API": [[7, "openai-moderation-api"]], "Custom Judge Validator": [[7, "custom-judge-validator"]], "Structured Output": [[8, "structured-output"]], "Techniques": [[8, "techniques"]], "Prompt Engineering": [[8, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[8, "json-mode-fine-tuned"]], "Logit Post-Processing": [[8, "logit-post-processing"]], "Outlines": [[8, "outlines"]], "LangChain": [[8, "langchain"]], "Discussion": [[8, "discussion"]], "Best Practices": [[8, "best-practices"]], "Comparing Solutions": [[8, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[8, "structured-output-frameworks"]], "Research and Ongoing Debate": [[8, "research-and-ongoing-debate"]], "Acknowledgements": [[8, "acknowledgements"]]}, "indexentries": {}})
\ No newline at end of file
+Search.setIndex({"docnames": ["markdown/intro", "markdown/preface", "markdown/toc", "notebooks/alignment", "notebooks/cost", "notebooks/evals", "notebooks/input", "notebooks/local", "notebooks/safety", "notebooks/structured_output"], "filenames": ["markdown/intro.md", "markdown/preface.md", "markdown/toc.md", "notebooks/alignment.ipynb", "notebooks/cost.ipynb", "notebooks/evals.ipynb", "notebooks/input.ipynb", "notebooks/local.ipynb", "notebooks/safety.ipynb", "notebooks/structured_output.ipynb"], "titles": ["<span class=\"section-number\">2. </span>About the Book", "<span class=\"section-number\">1. </span>Preface", "Taming LLMs", "<span class=\"section-number\">7. </span>Preference-Based Alignment", "<span class=\"section-number\">9. </span>The Falling Cost Paradox", "<span class=\"section-number\">3. </span>The Evals Gap", "<span class=\"section-number\">5. </span>Managing Input Data", "<span class=\"section-number\">8. </span>Local LLMs in Practice", "<span class=\"section-number\">6. </span>Safety", "<span class=\"section-number\">4. </span>Structured Output"], "terms": {"am": [0, 8], "alwai": [0, 3, 4, 5, 6, 9], "do": [0, 3, 4, 5, 6, 7, 8, 9], "which": [0, 3, 4, 5, 6, 7, 8, 9], "cannot": [0, 3, 4, 5, 7, 8], "order": [0, 3, 5, 6, 8, 9], "mai": [0, 1, 3, 4, 5, 6, 7, 8, 9], "learn": [0, 3, 5, 6, 7, 8, 9], "how": [0, 1, 3, 4, 5, 6, 7, 8, 9], "pablo": [0, 5], "picasso": 0, "In": [0, 3, 4, 5, 6, 7, 8, 9], "recent": [0, 3, 4, 5, 6, 7, 8, 9], "year": [0, 2, 3, 4, 5, 6, 7, 8, 9], "larg": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "languag": [0, 1, 2, 4, 5, 6, 7, 8, 9], "model": [0, 1, 2, 4, 6, 8, 9], "llm": [0, 1, 3, 6, 9], "have": [0, 1, 3, 4, 5, 6, 7, 8, 9], "emerg": [0, 3, 4, 6, 7, 8, 9], "transform": [0, 1, 3, 5, 6, 7, 8, 9], "forc": [0, 5, 9], "technologi": [0, 1, 4, 5, 6, 7, 8], "promis": [0, 3, 4, 5, 8], "revolution": [0, 8], "build": [0, 2, 3, 5, 6, 7, 8, 9], "product": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "interact": [0, 3, 4, 5, 6, 7, 8, 9], "comput": [0, 3, 4, 5, 6, 7, 8, 9], "from": [0, 1, 4, 5, 6, 7, 8, 9], "chatgpt": [0, 3, 4, 7, 9], "github": [0, 2, 3, 4, 5, 7, 8, 9], "copilot": 0, "claud": [0, 3, 5, 7, 8], "artifact": 0, "system": [0, 3, 4, 5, 6, 7, 8, 9], "captur": [0, 1, 3, 5, 7, 8], "public": [0, 3, 5, 7, 8], "imagin": [0, 7], "spark": 0, "gold": [0, 3, 5, 6, 8], "rush": 0, "ai": [0, 3, 4, 5, 7, 9], "power": [0, 2, 3, 4, 5, 7, 8, 9], "applic": [0, 1, 2, 3, 4, 6, 7, 8, 9], "howev": [0, 3, 4, 5, 6, 7, 8, 9], "beneath": 0, "surfac": [0, 5], "technolog": [0, 1, 4, 5, 6, 8], "revolut": [0, 4], "li": [0, 3, 5, 7, 8, 9], "complex": [0, 1, 3, 5, 6, 7, 8, 9], "landscap": [0, 3, 5, 7], "practition": [0, 1, 4, 5, 7, 9], "must": [0, 3, 4, 5, 7, 8, 9], "navig": [0, 2, 5, 7, 8], "focus": [0, 3, 4, 5, 6, 7, 8, 9], "bring": [0, 3, 7], "awar": [0, 3, 4, 5, 6, 8], "limit": [0, 1, 2, 4, 5, 6, 7, 8, 9], "har": [0, 2, 5], "solut": [0, 2, 4, 5, 6, 7, 8], "overcom": [0, 5, 6], "them": [0, 1, 3, 4, 5, 6, 7, 8, 9], "robust": [0, 3, 4, 5, 6, 7, 8, 9], "It": [0, 3, 4, 5, 6, 7, 8, 9], "offer": [0, 3, 4, 5, 6, 7, 8, 9], "critic": [0, 2, 3, 4, 5, 6, 7, 8, 9], "implement": [0, 2, 3, 4, 5, 7, 9], "back": [0, 5, 6, 7, 8, 9], "reproduc": [0, 1, 2, 5, 7], "exampl": [0, 1, 2, 3, 5, 7, 8, 9], "while": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "mani": [0, 1, 3, 4, 5, 6, 7, 8, 9], "resourc": [0, 3, 4, 5, 6, 7, 8], "cover": [0, 3, 4, 5, 6, 7, 8, 9], "capabl": [0, 1, 2, 4, 5, 6, 7, 8, 9], "specif": [0, 3, 4, 5, 6, 7, 9], "hidden": [0, 3, 8], "pitfal": [0, 1, 3, 4, 5, 6, 7, 9], "engin": [0, 1, 2, 3, 4, 5, 7, 8], "technic": [0, 1, 2, 3, 5, 7, 9], "manag": [0, 1, 4, 5, 7, 8, 9], "face": [0, 3, 4, 5, 6, 7, 8], "when": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "comprehens": [0, 2, 3, 4, 5, 6, 7, 8, 9], "guid": [0, 1, 3, 4, 5, 6, 7, 8, 9], "leverag": [0, 3, 5, 6, 7, 8, 9], "battl": [0, 2, 7], "test": [0, 2, 3, 4, 6, 7, 8, 9], "tool": [0, 1, 3, 4, 6], "throughout": [0, 4, 5, 6, 7, 8], "tackl": [0, 3, 5, 6, 8], "follow": [0, 3, 4, 5, 6, 7, 8, 9], "non": [0, 3, 7, 8, 9], "exhaust": [0, 7], "list": [0, 3, 5, 6, 7, 8, 9], "structur": [0, 3, 4, 5, 7, 8], "un": 0, "reliabl": [0, 1, 3, 4, 5, 7, 8, 9], "struggl": [0, 1, 3, 5, 6, 7, 8, 9], "maintain": [0, 1, 3, 4, 5, 6, 7, 8, 9], "consist": [0, 1, 3, 4, 5, 6, 7, 8, 9], "output": [0, 1, 3, 5, 6, 7, 8], "format": [0, 3, 4, 5, 6, 7, 8, 9], "complic": [0, 8], "integr": [0, 1, 3, 4, 5, 6, 7, 8, 9], "larger": [0, 3, 4, 5, 6, 7, 8, 9], "make": [0, 3, 4, 5, 6, 7, 8, 9], "error": [0, 3, 5, 6, 8, 9], "handl": [0, 3, 4, 5, 6, 7, 8, 9], "more": [0, 1, 3, 5, 6, 7, 8, 9], "size": [0, 3, 5, 6, 7, 8, 9], "length": [0, 3, 5, 7, 9], "constraint": [0, 1, 3, 4, 5, 6, 7, 8, 9], "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9], "sensit": [0, 3, 4, 5, 6, 7, 8], "input": [0, 3, 5, 7, 8, 9], "data": [0, 1, 4, 5, 7, 8, 9], "requir": [0, 3, 6, 7, 8, 9], "care": [0, 3, 4, 5, 6, 7, 8, 9], "strategi": [0, 3, 4, 5, 6, 7, 8, 9], "long": [0, 1, 3, 4, 5, 7, 8, 9], "form": [0, 3, 4, 5, 7, 8, 9], "unstructur": [0, 6, 7, 9], "effect": [0, 1, 3, 4, 5, 6, 8, 9], "tradit": [0, 3, 7, 8], "softwar": [0, 1, 3, 4, 6, 7, 8, 9], "methodologi": [0, 3, 5, 7, 8, 9], "break": [0, 1, 3, 4, 5, 6, 8], "down": [0, 1, 4, 5, 6, 7, 8], "deal": [0, 3, 7], "determinist": [0, 9], "gener": [0, 1, 4, 7, 9], "new": [0, 2, 3, 4, 5, 6, 7, 8, 9], "safeti": [0, 3, 5, 9], "align": [0, 4, 5, 6, 7, 8, 9], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9], "harm": [0, 3, 5, 7], "bias": [0, 3, 5, 7, 8, 9], "inappropri": [0, 3, 8], "safeguard": [0, 5, 8], "monitor": [0, 3, 4, 5, 6, 7, 8], "ensur": [0, 3, 4, 5, 6, 7, 8, 9], "safe": [0, 3, 5, 8, 9], "deploy": [0, 3, 4, 5, 8, 9], "cost": [0, 3, 5, 6, 8, 9], "optim": [0, 1, 5, 6, 7, 8], "The": [0, 1, 3, 6, 8, 9], "financi": [0, 1, 3, 4, 5, 6, 8, 9], "oper": [0, 3, 5, 6, 7, 8, 9], "base": [0, 1, 4, 7, 9], "quickli": [0, 3, 4, 7], "becom": [0, 3, 4, 5, 6, 7, 8, 9], "prohibit": [0, 3, 5, 7], "without": [0, 1, 3, 4, 5, 7, 8, 9], "vendor": [0, 4, 5, 7], "lock": [0, 3, 4, 7], "cloud": [0, 3, 4, 5, 7, 8, 9], "provid": [0, 2, 3, 4, 5, 6, 7, 8, 9], "creat": [0, 1, 3, 4, 5, 6, 7, 8, 9], "signific": [0, 3, 4, 5, 6, 7, 8, 9], "depend": [0, 3, 4, 5, 6, 7, 9], "through": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "proprietari": [0, 3, 7, 8, 9], "infrastructur": [0, 4, 7], "difficult": [0, 3, 5, 8], "switch": [0, 7], "self": [0, 3, 5, 6, 7, 8, 9], "host": [0, 4, 5, 7, 8], "take": [0, 2, 3, 4, 5, 6, 7, 8, 9], "hand": [0, 7, 8, 9], "focu": [0, 2, 3, 4, 5, 6, 7, 8, 9], "access": [0, 3, 4, 5, 6, 7, 8, 9], "all": [0, 1, 3, 4, 5, 6, 7, 8, 9], "fulli": [0, 3, 5, 8], "document": [0, 3, 4, 5, 7, 8, 9], "allow": [0, 5, 6, 7, 8, 9], "reader": [0, 2, 6], "replic": [0, 5, 8, 9], "result": [0, 3, 4, 5, 6, 8, 9], "exactli": [0, 5, 9], "design": [0, 1, 3, 7, 9], "run": [0, 3, 4, 5, 6, 7, 8, 9], "consum": [0, 3, 4, 5, 6, 7, 8, 9], "grade": [0, 3, 4, 5, 6, 7, 8], "hardwar": [0, 3, 4, 5], "expens": [0, 3, 4, 5, 6, 7, 8], "avail": [0, 3, 4, 5, 6, 7, 8, 9], "notebook": [0, 3, 9], "modifi": [0, 3, 5, 8, 9], "extend": [0, 3, 4, 5, 6, 7, 9], "built": [0, 5, 7, 8, 9], "us": [0, 1, 3, 4, 7, 8, 9], "free": [0, 1, 3, 5, 7, 8], "everyon": [0, 5, 7], "minim": [0, 3, 4, 5, 7, 8, 9], "framework": [0, 3, 4, 5, 7], "wai": [0, 3, 4, 5, 6, 7, 8, 9], "priorit": [0, 3, 5, 6, 7, 8], "transpar": [0, 3, 4, 5, 7, 8], "visibl": [0, 5], "being": [0, 3, 4, 5, 6, 7, 8, 9], "better": [0, 2, 3, 4, 5, 6, 7, 8], "understand": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "custom": [0, 3, 5, 6, 9], "flexibl": [0, 4, 5, 7, 8, 9], "adapt": [0, 3, 4, 5, 7, 8], "case": [0, 4, 5, 9], "unlik": [0, 3, 5, 7], "black": [0, 3], "box": [0, 7], "commerci": [0, 3, 5, 7, 8, 9], "most": [0, 3, 4, 5, 6, 7, 8, 9], "freeli": [0, 9], "foster": [0, 3, 5, 8, 9], "reduc": [0, 3, 4, 5, 6, 7, 8, 9], "independ": [0, 5, 6, 8, 9], "freedom": [0, 7, 9], "architectur": [0, 3, 4, 5, 7, 9], "decis": [0, 3, 4, 5, 6, 7, 8], "keep": [0, 3, 5, 6, 7, 8], "principl": [0, 3, 5, 7, 8], "itself": [0, 3, 5, 6, 7, 8], "": [0, 1, 3, 4, 5, 6, 7, 8, 9], "live": [0, 1, 5, 6, 8], "evolv": [0, 3, 4, 5, 7, 8], "chang": [0, 3, 5, 6, 7, 8, 9], "encourag": [0, 3, 5, 8, 9], "report": [0, 3, 5, 6, 7, 8, 9], "suggest": [0, 3, 5, 7, 8, 9], "improv": [0, 3, 4, 5, 6, 7, 8, 9], "contribut": [0, 4, 5, 6, 7, 8], "via": [0, 3, 4, 5, 6, 7, 8, 9], "pull": [0, 7], "request": [0, 3, 4, 5, 6, 7, 8, 9], "share": [0, 3, 5, 6, 7, 8, 9], "own": [0, 3, 4, 5, 6, 7, 8], "experi": [0, 3, 4, 5, 6, 7, 8, 9], "commun": [0, 3, 4, 5, 6, 8, 9], "propos": [0, 4, 5, 6, 8], "chapter": [0, 3, 4, 5, 6, 7, 8, 9], "section": [0, 3, 4, 5, 6, 7, 8, 9], "found": [0, 3, 4, 5, 7, 9], "http": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], "com": [0, 2, 3, 4, 5, 6, 7, 8, 9], "souzatharsi": [0, 2, 3, 4, 5, 6, 7, 8, 9], "tamingllm": [0, 2, 3, 4, 5, 6, 7, 8, 9], "whether": [0, 3, 4, 5, 6, 7, 8, 9], "you": [0, 1, 3, 4, 5, 6, 7, 8, 9], "ve": [0, 7], "typo": [0, 8], "want": [0, 1, 3, 6, 7, 8, 9], "entir": [0, 4, 5, 6, 7, 9], "welcom": 0, "look": [0, 2, 3, 4, 5, 6, 7, 8], "our": [0, 1, 3, 4, 5, 6, 7, 8, 9], "goal": [0, 1, 3, 5, 6, 8, 9], "discourag": [0, 6], "enabl": [0, 3, 4, 5, 6, 7, 8, 9], "By": [0, 1, 2, 3, 5, 6, 8, 9], "upfront": [0, 2, 4], "equip": [0, 2, 5, 8], "avoid": [0, 3, 5, 7, 8, 9], "current": [0, 2, 3, 4, 5, 6, 8, 9], "discours": [0, 2], "around": [0, 2, 3, 5, 6, 7, 8, 9], "tend": [0, 2, 5, 8], "toward": [0, 3, 5, 8, 9], "extrem": [0, 3, 4, 5, 8], "either": [0, 3, 5, 6, 7, 8, 9], "uncrit": 0, "enthusiasm": 0, "wholesal": [0, 5], "dismiss": 0, "differ": [0, 3, 4, 5, 6, 7, 8, 9], "rather": [0, 1, 3, 4, 5, 6, 7, 8], "than": [0, 1, 3, 5, 6, 7, 8, 9], "theoret": [0, 3], "examin": [0, 3, 5, 6, 7, 8, 9], "first": [0, 1, 3, 4, 5, 6, 7, 8, 9], "everi": [0, 4, 5, 8], "concept": [0, 3, 5, 8], "illustr": [0, 3, 5, 6, 7, 8, 9], "execut": [0, 5, 7, 8], "immedi": [0, 3, 4, 5, 7], "analysi": [0, 1, 3, 4, 5, 6, 7, 8], "balanc": [0, 3, 4, 5, 7, 8, 9], "both": [0, 3, 4, 5, 6, 7, 8], "help": [0, 3, 4, 5, 6, 7, 8, 9], "inform": [0, 3, 4, 5, 6, 7, 8, 9], "intend": [0, 5, 7, 8], "develop": [0, 1, 3, 4, 5, 6, 7, 8, 9], "step": [0, 1, 3, 4, 5, 6, 7, 8, 9], "insight": [0, 3, 4, 5, 6, 7, 8, 9], "along": [0, 3, 4, 5, 7, 8], "guidanc": [0, 3, 9], "could": [0, 1, 3, 4, 5, 6, 7, 8, 9], "derail": 0, "project": [0, 3, 4, 5, 6, 7, 8], "earli": [0, 3, 4, 5, 8, 9], "befor": [0, 3, 4, 5, 6, 8, 9], "thei": [0, 1, 3, 4, 5, 6, 7, 8, 9], "costli": [0, 5, 6, 8], "problem": [0, 1, 2, 3, 4, 6, 7, 8], "too": [0, 1, 3, 5, 6, 7, 8], "late": [0, 3, 4, 8], "lifecycl": [0, 7, 8], "lead": [0, 1, 3, 4, 5, 6, 7, 8, 9], "genai": [0, 1, 3, 6, 8], "initi": [0, 1, 3, 4, 5, 6, 7, 8, 9], "leader": [0, 2, 5], "advoc": [0, 8], "anyon": [0, 8], "seek": [0, 5, 7, 8], "work": [0, 1, 3, 5, 6, 7, 8, 9], "typic": [0, 3, 4, 5, 7, 8, 9], "job": [0, 5, 6, 7, 8], "role": [0, 3, 5, 6, 7, 8, 9], "platform": [0, 5, 6, 7, 8, 9], "backend": [0, 3, 5], "exist": [0, 3, 4, 5, 7], "ml": [0, 6, 8], "transit": [0, 4, 5, 7, 9], "overse": 0, "motiv": [0, 3, 4, 5, 6, 9], "need": [0, 3, 4, 5, 6, 7, 8, 9], "readi": [0, 5, 8], "desir": [0, 3, 5, 9], "perform": [0, 3, 5, 6, 8, 9], "after": [0, 1, 3, 5, 6, 7, 8, 9], "read": [0, 3, 4, 5, 6, 8, 9], "implic": [0, 1, 3, 5, 8], "recommend": [0, 3, 5, 6, 7, 8, 9], "abl": [0, 3, 5, 9], "deploi": [0, 3, 5, 7, 8], "proper": [0, 3, 4, 7, 8, 9], "realist": [0, 3, 4, 8], "effort": [0, 5, 7, 8, 9], "estim": [0, 4, 5, 6, 8], "impact": [0, 3, 4, 5, 6, 7, 8, 9], "timelin": 0, "To": [0, 3, 5, 7, 8, 9], "should": [0, 3, 4, 5, 6, 7, 8, 9], "basic": [0, 3, 5, 7, 8], "program": [0, 5, 6, 7, 9], "knowledg": [0, 3, 5, 6, 7, 8], "introductori": [0, 1, 2], "langchain": [0, 5, 6], "e": [0, 1, 3, 4, 5, 6, 7, 8, 9], "g": [0, 3, 4, 5, 6, 7, 8, 9], "chat": [0, 3, 5, 6, 7, 8, 9], "prompt": [0, 4, 5, 6, 8], "templat": [0, 5, 6, 9], "openai": [0, 3, 5, 6, 7, 9], "anthrop": [0, 3, 6, 9], "similar": [0, 3, 4, 5, 6, 7, 9], "dive": [0, 4], "here": [0, 2, 3, 4, 5, 6, 7, 8, 9], "get": [0, 3, 4, 5, 6, 7, 8, 9], "start": [0, 3, 4, 5, 6, 7, 8, 9], "clone": [0, 3], "companion": 0, "git": 0, "cd": 0, "activ": [0, 3, 4, 5, 6, 7, 8], "virtual": [0, 5], "m": [0, 3, 5, 6, 7, 8, 9], "venv": [0, 9], "tame": [0, 3, 4, 5, 6, 7, 8, 9], "env": [0, 3, 5, 6, 8, 9], "bin": [0, 7], "On": [0, 5, 7, 9], "window": [0, 4, 5, 6, 7], "script": [0, 7], "try": [0, 1, 3, 5, 8, 9], "each": [0, 3, 4, 5, 6, 7, 8, 9], "contain": [0, 3, 4, 5, 6, 7, 8, 9], "possibl": [0, 3, 4, 5, 6, 7, 8, 9], "includ": [0, 1, 3, 4, 5, 6, 7, 8, 9], "necessari": [0, 3, 4, 5, 8], "instal": [0, 3, 5, 7, 9], "go": [0, 3, 5, 6, 9], "feel": [0, 7], "prefer": [0, 5, 6, 7, 8, 9], "packag": [0, 4, 5, 6, 7, 9], "pip": [0, 3, 5, 7, 9], "poetri": [0, 8], "file": [0, 3, 5, 6, 7, 8, 9], "root": [0, 3], "directori": [0, 5, 7], "add": [0, 3, 5, 6, 7, 8], "other": [0, 3, 4, 5, 6, 7, 8, 9], "openai_api_kei": [0, 3], "your_openai_api_key_her": 0, "never": [0, 9], "commit": [0, 3, 5, 8], "version": [0, 3, 4, 5, 7, 8, 9], "control": [0, 1, 3, 4, 5, 6, 7, 8, 9], "kept": [0, 5], "privat": [0, 5], "If": [0, 1, 3, 4, 5, 7, 8, 9], "encount": [0, 2, 5, 8], "rate": [0, 3, 4, 5, 6, 7, 8], "consid": [0, 3, 4, 5, 6, 7, 8, 9], "smaller": [0, 3, 4, 5, 6, 7, 9], "retri": [0, 9], "logic": [0, 1, 3, 5, 6, 8], "conflict": [0, 3, 5], "fresh": 0, "like": [0, 1, 3, 4, 5, 6, 7, 8, 9], "check": [0, 5, 6, 7, 8, 9], "page": [0, 5, 6, 7], "known": [0, 5, 6, 8, 9], "now": [0, 1, 3, 4, 5, 6, 7, 8, 9], "let": [0, 3, 4, 5, 6, 7, 8, 9], "begin": [0, 5, 7, 8, 9], "explor": [0, 1, 3, 4, 5, 6, 7, 8, 9], "dr": [0, 3], "tharsi": [0, 2, 3, 4, 5, 6, 7, 8, 9], "souza": [0, 2, 3, 4, 5, 6, 7, 8, 9], "scientist": [0, 1, 7, 8], "special": [0, 4, 5, 6, 7, 8, 9], "he": [0, 3, 5, 8], "lectur": 0, "columbia": 0, "univers": [0, 5, 7, 8], "master": [0, 4, 7, 9], "scienc": [0, 3, 5, 8], "appli": [0, 3, 5, 6, 7, 8, 9], "analyt": 0, "incom": [0, 5, 6], "head": [0, 3, 5, 6, 8, 9], "equiti": [0, 5, 6], "citadel": 0, "former": [0, 1, 5, 7], "senior": [0, 5], "vp": 0, "two": [0, 3, 4, 5, 6, 7, 8, 9], "sigma": [0, 3], "invest": [0, 3, 4, 5, 6, 8], "also": [0, 3, 4, 5, 6, 7, 8, 9], "enjoi": 0, "mentor": 0, "under": [0, 3, 4, 5, 7, 8, 9], "repres": [0, 3, 4, 5, 6, 7, 9], "student": [0, 3, 6, 8], "profession": [0, 3, 5, 8, 9], "divers": [0, 3, 4, 5, 6, 8], "global": [0, 5, 6, 8], "ecosystem": [0, 4, 5, 7], "With": [0, 3, 5, 7, 8, 9], "over": [0, 2, 3, 4, 5, 6, 7, 8, 9], "15": [0, 5, 6, 7, 8, 9], "deliv": [0, 4, 5, 7], "across": [0, 1, 3, 4, 5, 6, 7, 8, 9], "startup": 0, "fortun": 0, "500": [0, 3, 5, 6, 8], "compani": [0, 3, 4, 5, 6, 8, 9], "numer": [0, 4, 5, 8, 9], "scholarli": 0, "frequent": [0, 5, 7, 9], "speaker": [0, 5], "academ": [0, 3, 5, 8], "busi": [0, 5, 6, 7, 8], "confer": [0, 6, 9], "ground": [0, 3, 5, 7], "background": [0, 1, 5, 6, 7], "draw": [0, 3, 5, 8, 9], "scale": [0, 3, 4, 5, 6, 7, 8, 9], "stage": [0, 3, 8, 9], "major": [0, 3, 4, 5, 6, 7, 8, 9], "institut": [0, 5, 8], "well": [0, 3, 4, 5, 6, 7, 8, 9], "advis": [0, 3], "profit": [0, 5, 6, 8, 9], "organ": [0, 3, 4, 5, 6, 7], "uniqu": [0, 3, 4, 5, 6, 7, 8, 9], "bridg": [0, 6, 7, 8], "gap": [0, 1, 3, 4, 7, 8], "between": [0, 1, 3, 4, 5, 6, 7, 8, 9], "potenti": [0, 1, 3, 4, 5, 6, 7, 8, 9], "next": [0, 1, 3, 4, 5, 6, 7, 8, 9], "hold": [0, 3, 5], "ph": [0, 8], "d": [0, 3, 4, 5, 6, 7, 8, 9], "ucl": 0, "london": 0, "phil": [0, 8], "sc": 0, "b": [0, 4, 5, 7, 8, 9], "tell": [1, 3, 8], "mere": [1, 5], "what": [1, 3, 4, 5, 6, 7, 8, 9], "someth": [1, 5, 7], "i": [1, 2, 4, 5, 7, 8, 9], "emanuel": [1, 3, 5, 8], "derman": 1, "an": [1, 2, 3, 4, 5, 6, 7, 8, 9], "altern": [1, 3, 4, 5, 6, 7, 8], "titl": [1, 2, 3, 4, 5, 6, 7, 8, 9], "thi": [1, 2, 3, 4, 5, 6, 7, 8, 9], "book": [1, 5, 6], "been": [1, 3, 4, 5, 7, 8], "behav": 1, "badli": 1, "come": [1, 3, 5, 6, 7, 8, 9], "notic": [1, 3, 4, 5, 8], "parallel": [1, 3, 5, 7], "semin": [1, 8], "2011": 1, "coincident": 1, "just": [1, 3, 4, 5, 6, 7, 8, 9], "caution": 1, "against": [1, 3, 4, 5, 7, 8], "treat": [1, 5, 8], "perfect": [1, 5, 7], "represent": [1, 5, 6, 7, 8], "realiti": [1, 6, 8], "aim": [1, 3, 4, 5, 6, 7, 8, 9], "highlight": [1, 3, 5, 6, 7, 8, 9], "practic": [1, 3, 4, 5, 6, 8], "physicist": 1, "goldman": 1, "sach": 1, "quant": 1, "scientif": [1, 3, 5, 7], "fail": [1, 3, 5, 8], "we": [1, 3, 4, 5, 6, 7, 8, 9], "mistak": [1, 8], "approxim": [1, 4, 5, 9], "full": [1, 3, 4, 5, 6, 7, 8, 9], "assumpt": [1, 5, 8], "core": [1, 4, 5, 6, 7, 8], "premis": [1, 7], "hi": [1, 5, 8, 9], "aspect": [1, 3, 5, 6, 8], "world": [1, 3, 4, 5, 6, 7, 8, 9], "inher": [1, 2, 3, 5, 8, 9], "involv": [1, 3, 4, 5, 6, 7, 8, 9], "simplif": 1, "argu": [1, 4, 8, 9], "crise": 1, "2008": 1, "crash": 1, "occur": [1, 3, 5, 8], "partli": 1, "becaus": [1, 3, 5, 8], "peopl": [1, 3, 5, 7, 8], "put": [1, 5, 7], "much": [1, 3, 5, 6, 7], "faith": 1, "mathemat": [1, 5, 7, 9], "recogn": [1, 3, 5, 8], "human": [1, 4, 5, 6, 7, 8, 9], "behavior": [1, 3, 5, 7, 8], "market": [1, 4, 5, 6, 7, 9], "dynam": [1, 3, 5, 6, 8], "hallucin": [1, 3, 5, 6, 8, 9], "fact": [1, 3, 5, 6, 8], "reason": [1, 3, 5, 6, 7, 8, 9], "Their": [1, 5, 9], "respons": [1, 4, 5, 6, 7, 8, 9], "often": [1, 3, 4, 5, 6, 7, 8, 9], "convinc": [1, 3], "probabilist": [1, 5, 9], "train": [1, 4, 5, 7, 8, 9], "true": [1, 3, 4, 5, 6, 8, 9], "even": [1, 3, 4, 5, 7, 8, 9], "though": [1, 3, 4, 5, 7, 8, 9], "insist": 1, "machin": [1, 3, 7, 8, 9], "todai": [1, 4, 7, 9], "grow": [1, 3, 5, 7, 8, 9], "pervas": [1, 8], "belief": [1, 7, 8], "solv": [1, 3, 4, 5, 7, 8, 9], "ani": [1, 3, 4, 5, 6, 7, 8, 9], "context": [1, 3, 4, 5, 6, 7, 8, 9], "content": 1, "wish": [1, 5], "user": [1, 4, 5, 6, 7, 9], "moreov": 1, "were": [1, 3, 5, 7, 8, 9], "token": [1, 3, 4, 5, 6, 7, 8, 9], "predict": [1, 3, 5, 7, 8, 9], "chatbot": [1, 3, 5, 7, 8], "twist": [1, 8], "wrap": [1, 7, 9], "further": [1, 3, 4, 5, 6, 7, 8, 9], "daili": [1, 4, 7, 8], "life": [1, 5, 7, 8], "workflow": [1, 4, 5, 7, 8, 9], "affect": [1, 5, 6, 7, 8], "decid": [1, 3, 5, 6], "action": [1, 3, 5, 6, 8], "coupl": [1, 7], "lack": [1, 3, 5, 6, 8, 9], "pose": [1, 3, 5, 6, 8, 9], "risk": [1, 3, 4, 5, 6, 7], "still": [1, 4, 5, 7, 8], "figur": [1, 5, 7], "out": [1, 3, 4, 5, 6, 7, 8, 9], "serv": [1, 3, 4, 5, 6, 8, 9], "builder": [1, 7], "who": [1, 3, 5, 7, 8, 9], "remain": [1, 3, 4, 5, 6, 7, 8], "clear": [1, 3, 4, 5, 7, 8, 9], "ei": 1, "about": [1, 3, 4, 5, 6, 7, 8, 9], "therefor": [1, 3, 5, 7, 8], "end": [1, 3, 4, 5, 6, 7, 8, 9], "detail": [1, 3, 4, 5, 6, 7, 8, 9], "python": [1, 2, 5, 6, 7, 8, 9], "code": [1, 2, 3, 5, 6, 7, 8, 9], "diminish": [1, 3, 4, 5], "promot": [1, 3, 5, 8], "nuanc": [1, 3, 5, 6, 7, 8, 9], "acknowledg": [1, 5, 8], "within": [1, 3, 4, 5, 6, 7, 8, 9], "trustworthi": [1, 8], "taught": 1, "u": [1, 3, 5, 6, 8, 9], "where": [1, 3, 4, 5, 6, 7, 8, 9], "der11": 1, "why": [1, 3, 5, 8, 9], "confus": [1, 4, 8], "illus": 1, "disast": [1, 5], "wall": [1, 7], "street": [1, 7], "press": [1, 5, 7], "isbn": [1, 3, 5], "9781439165010": 1, "url": [1, 2, 3, 4, 5, 6, 7, 8, 9], "googl": [1, 5, 7, 9], "co": [1, 3, 4, 5, 7, 8], "uk": [1, 8], "id": [1, 5, 6, 7, 8, 9], "lke_cwm4wm8c": 1, "sign": [2, 5, 8], "up": [2, 3, 4, 5, 6, 7, 8], "receiv": [2, 3, 5, 6, 7, 8, 9], "updat": [2, 3, 4, 5, 6, 7, 8, 9], "abstract": [2, 5, 6, 8, 9], "heavili": [2, 3, 4, 5, 6, 8, 9], "gloss": 2, "fundament": [2, 3, 5, 7, 8, 9], "challeng": [2, 3, 4, 5, 6, 7, 8, 9], "convers": [2, 3, 4, 5, 6, 7, 8, 9], "kei": [2, 3, 4, 6, 7, 8, 9], "proven": [2, 4], "yet": [2, 3, 4, 5, 6, 7, 8], "concret": [2, 4, 8, 9], "sidestep": 2, "misc": [2, 3, 4, 5, 6, 7, 8, 9], "tharsistpsouza2024tamingllm": [2, 3, 4, 5, 6, 7, 8, 9], "author": [2, 3, 4, 5, 6, 7, 8, 9], "t": [2, 3, 4, 5, 6, 7, 8, 9], "p": [2, 3, 4, 5, 6, 7, 8, 9], "2024": [2, 3, 4, 5, 6, 8, 9], "journal": [2, 3, 4, 5, 6, 7, 8, 9], "repositori": [2, 3, 4, 5, 6, 7, 8, 9], "valu": [3, 5, 6, 7, 8, 9], "its": [3, 4, 5, 6, 7, 8, 9], "privileg": 3, "abov": [3, 5, 8], "soon": [3, 9], "lose": [3, 5], "dwight": 3, "eisenhow": 3, "releas": [3, 4, 5, 6, 7, 8, 9], "3": [3, 4, 5, 6, 7, 9], "5": [3, 4, 5, 6, 7, 9], "2022": [3, 5, 7, 8], "mark": [3, 5, 6, 7, 8], "pivot": [3, 5, 7], "moment": 3, "histori": [3, 4, 5, 7], "artifici": [3, 5, 7, 8], "intellig": [3, 5, 6, 7, 8], "five": [3, 5, 8], "dai": [3, 4, 5, 6, 7, 8, 9], "launch": [3, 5, 8], "attract": [3, 5], "million": [3, 4, 5, 7], "month": [3, 4, 5, 7, 8], "becam": [3, 4], "fastest": [3, 5, 8], "100": [3, 4, 5, 7, 8, 9], "monthli": [3, 4, 5], "rais": [3, 4, 5, 8], "intrigu": 3, "question": [3, 4, 5, 6, 7, 8, 9], "did": [3, 5, 6, 9], "dramat": [3, 4, 5, 7, 9], "predecessor": 3, "gpt": [3, 4, 5, 6, 7, 8, 9], "had": [3, 5], "same": [3, 5, 6, 7, 8, 9], "number": [3, 4, 5, 6, 7, 8, 9], "paramet": [3, 4, 5, 6, 7, 8, 9], "far": [3, 4, 7, 8], "less": [3, 4, 5, 6, 7, 8], "attent": [3, 4, 6, 7], "arguabl": [3, 6, 7], "feedback": [3, 5, 8, 9], "abil": [3, 4, 5, 6, 7, 8, 9], "least": [3, 5, 8], "ey": 3, "breakthrough": [3, 7, 8], "demonstr": [3, 4, 5, 6, 7, 8, 9], "crucial": [3, 4, 6, 7, 8, 9], "greater": [3, 5, 6, 7, 8], "process": [3, 4, 5, 6, 7, 8], "modern": [3, 5, 6, 9], "techniqu": [3, 4, 5, 6, 7], "direct": [3, 5, 7, 8], "rafailov": 3, "et": [3, 4, 5, 6, 7, 8, 9], "al": [3, 4, 5, 6, 7, 8, 9], "present": [3, 5, 6, 7, 8, 9], "autom": [3, 4, 5, 8, 9], "fashion": [3, 9], "open": [3, 4, 5, 6, 8, 9], "sourc": [3, 4, 5, 6, 8, 9], "common": [3, 4, 5, 6, 7, 9], "pre": [3, 4, 5, 7, 8, 9], "default": [3, 5, 7, 8, 9], "state": [3, 5, 6, 7, 8, 9], "art": [3, 5, 8], "object": [3, 4, 5, 6, 7, 8, 9], "given": [3, 4, 5, 6, 7, 8, 9], "webpag": 3, "internet": [3, 5], "veri": [3, 4, 5, 7, 8], "ask": [3, 5, 7, 8, 9], "instruct": [3, 4, 5, 6, 7, 8, 9], "sai": [3, 9], "ouyang": [3, 8], "2": [3, 4, 5, 6, 9], "explain": [3, 5], "moon": 3, "land": [3, 5, 7], "6": [3, 4, 5, 6, 7], "old": [3, 5], "import": [3, 4, 5, 6, 7, 8, 9], "pipelin": [3, 4, 5, 6, 7, 8, 9], "pipe": [3, 8], "text": [3, 4, 5, 6, 7, 8, 9], "gpt2": [3, 5], "msg": [3, 6], "short": [3, 5, 6, 8, 9], "sentenc": [3, 5, 6, 8], "_": [3, 5, 8, 9], "rang": [3, 4, 5, 6, 7, 8, 9], "len": [3, 5, 6, 7, 8, 9], "print": [3, 4, 5, 6, 7, 8, 9], "f": [3, 4, 5, 6, 7, 8, 9], "n": [3, 5, 6, 7, 8, 9], "1": [3, 4, 5, 6, 7, 9], "0": [3, 4, 5, 6, 7, 8, 9], "generated_text": [3, 9], "good": [3, 5, 6, 7, 9], "idea": [3, 4, 7, 8, 9], "one": [3, 4, 5, 6, 7, 8, 9], "those": [3, 5, 8, 9], "littl": [3, 5], "green": [3, 6, 8], "dot": [3, 4, 6], "Then": [3, 4, 5], "line": [3, 5, 6, 7, 8], "later": [3, 5, 6, 7, 8, 9], "re": [3, 4, 5, 6, 7, 8, 9], "alreadi": [3, 5, 9], "movi": 3, "theori": [3, 5, 6], "some": [3, 5, 6, 7, 8, 9], "mean": [3, 4, 5, 6, 7, 8, 9], "word": [3, 4, 5, 6, 9], "tepid": 3, "articl": [3, 5, 7, 8], "sure": [3, 5, 6, 8, 9], "lunar": 3, "As": [3, 4, 5, 7, 8, 9], "see": [3, 4, 5, 6, 7, 8, 9], "coher": [3, 5, 6, 7, 9], "explan": [3, 5, 8, 9], "child": [3, 5, 8], "nonsens": [3, 8], "meander": 3, "unrel": [3, 5, 8], "topic": [3, 5, 6, 7, 8, 9], "simpl": [3, 5, 6, 7, 8, 9], "appropri": [3, 4, 5, 6, 7, 8, 9], "young": [3, 5, 8], "instead": [3, 4, 5, 6, 7, 8, 9], "address": [3, 4, 5, 6, 7, 8, 9], "issu": [3, 5, 6, 8, 9], "introduc": [3, 5, 6, 7, 8, 9], "rlhf": [3, 4, 8, 9], "intent": [3, 8], "wide": [3, 4, 5, 6, 7, 8, 9], "task": [3, 4, 6, 8, 9], "fig": [3, 4, 5, 6, 7, 8, 9], "7": [3, 4, 5, 6, 7, 8], "collect": [3, 5, 7, 8, 9], "sampl": [3, 6, 7, 9], "label": [3, 5, 7, 8, 9], "comparison": [3, 6], "reward": [3, 5, 7, 8], "sever": [3, 4, 5, 6, 7, 8, 9], "rank": [3, 5, 7, 8], "best": [3, 4, 5, 7, 8], "worst": 3, "rm": [3, 7], "reinforc": [3, 5, 7, 8], "write": [3, 5, 6, 7, 8, 9], "stori": [3, 8], "frog": 3, "calcul": [3, 4, 5, 6, 7, 8, 9], "score": [3, 4, 5, 6, 7, 8, 9], "ppo": [3, 7], "proxim": [3, 7], "iter": [3, 5, 6, 7, 8, 9], "accur": [3, 4, 5, 6, 7, 8], "undesir": [3, 8], "simplifi": [3, 5, 7, 9], "view": [3, 5, 6, 8], "show": [3, 4, 5, 6, 7, 8, 9], "progress": [3, 4, 8], "pattern": [3, 4, 5, 7, 8, 9], "ha": [3, 4, 5, 6, 7, 8, 9], "instanc": [3, 4, 5, 6, 7, 8], "directli": [3, 4, 5, 7, 8, 9], "For": [3, 4, 5, 6, 7, 8, 9], "llama": [3, 4, 5, 8, 9], "guard": 3, "team": [3, 5, 6, 7, 9], "8b": [3, 7, 8, 9], "wa": [3, 4, 5, 6, 7, 8, 9], "classif": [3, 5, 7, 8, 9], "bypass": [3, 8], "similarli": [3, 4, 5, 7, 8], "zephyr": 3, "7b": [3, 5, 7, 8, 9], "alpha": [3, 5, 9], "mistral": [3, 9], "publicli": [3, 5, 9], "assist": [3, 5, 6, 7, 8, 9], "paper": [3, 5, 7, 8, 9], "compon": [3, 5, 6, 7], "particular": [3, 4, 5, 6, 7, 8, 9], "foundat": [3, 4, 5, 6, 7, 8], "advanc": [3, 4, 5, 6, 7, 8, 9], "method": [3, 5, 6, 8, 9], "strong": [3, 5, 6, 7, 8, 9], "At": [3, 4, 5, 6, 7, 9], "high": [3, 4, 5, 6, 7, 8, 9], "level": [3, 4, 5, 6, 8, 9], "carefulli": [3, 4, 5, 6, 7, 8, 9], "curat": [3, 5, 7], "purpos": [3, 5, 6, 7, 8, 9], "exhibit": [3, 5, 7, 8], "domain": [3, 4, 5, 7, 8], "emploi": [3, 5, 6, 8, 9], "prove": [3, 5, 6, 8], "particularli": [3, 4, 5, 6, 7, 8, 9], "valuabl": [3, 5, 7, 9], "scenario": [3, 5, 7, 8, 9], "precis": [3, 4, 5, 7, 8, 9], "style": [3, 5], "tone": 3, "expertis": [3, 5, 6, 8], "medic": [3, 5, 7], "legal": [3, 5, 6, 7, 8], "field": [3, 5, 7, 8, 9], "adher": [3, 5, 6, 8, 9], "guidelin": [3, 5, 8], "servic": [3, 4, 5, 6, 7, 8], "standard": [3, 4, 5, 6, 7, 8], "approach": [3, 5, 6, 7, 9], "distinct": [3, 5, 7, 8, 9], "advantag": [3, 4, 5, 6, 7, 8, 9], "weight": [3, 4, 5, 6, 7, 8, 9], "maximum": [3, 5, 6, 7, 8], "lora": [3, 7, 8], "low": [3, 4, 5, 6, 7, 8, 9], "hu": [3, 6, 8, 9], "2021": [3, 4, 5, 6], "small": [3, 4, 5, 6, 7, 9], "matric": 3, "effici": [3, 4, 5, 6, 7, 8, 9], "qlora": 3, "quantiz": 3, "dettmer": 3, "2023": [3, 4, 5, 7, 8, 9], "combin": [3, 4, 5, 6, 7, 8, 9], "memori": [3, 4, 5, 6, 7, 8], "footprint": [3, 4, 7], "modest": [3, 7], "increas": [3, 4, 5, 6, 7, 8, 9], "likelihood": [3, 5, 8, 9], "obtain": [3, 5, 6, 7, 8, 9], "probabl": [3, 5, 7, 9], "outcom": [3, 5, 8, 9], "hong": [3, 5], "unintend": [3, 8], "suboptim": 3, "seen": [3, 5, 8], "research": [3, 4, 5, 6, 7], "maxim": [3, 5, 6], "shown": [3, 5, 6, 7, 8], "alon": [3, 5, 7, 8], "gain": [3, 4, 5, 7, 8], "achiev": [3, 4, 5, 6, 7, 8, 9], "bai": [3, 5, 8], "touvron": [3, 7], "sinc": [3, 4, 5, 6, 7, 8, 9], "main": [3, 5, 6, 7, 8, 9], "categori": [3, 5, 6, 7, 8, 9], "algorithm": [3, 5, 8], "meanwhil": [3, 7], "superior": [3, 5, 8], "benchmark": 3, "xu": [3, 5, 7, 8], "schulman": [3, 8], "2017": [3, 5], "popular": [3, 6, 7, 9], "understood": 3, "set": [3, 4, 5, 6, 7, 8, 9], "rule": [3, 5, 6, 7, 9], "govern": [3, 5, 6], "reflect": [3, 5, 6, 7, 8], "anoth": [3, 5, 7, 8], "adjust": [3, 5, 7, 8, 9], "One": [3, 4, 5, 6, 7, 8, 9], "strength": [3, 5, 7, 8], "2024c": [3, 7], "real": [3, 4, 5, 6, 7, 8, 9], "noisi": 3, "delai": [3, 5, 7, 8], "subsequ": [3, 6, 9], "situat": [3, 5, 6, 8], "clip": 3, "surrog": 3, "function": [3, 4, 5, 6, 7, 8, 9], "stabl": [3, 5], "prevent": [3, 4, 5, 8, 9], "overreact": 3, "converg": 3, "due": [3, 5, 6, 7, 8], "simplic": [3, 7], "award": [3, 5], "runner": 3, "neurip": 3, "blog": [3, 4, 5, 7, 8, 9], "4": [3, 4, 5, 6, 7, 9], "fit": [3, 4, 5, 6, 8, 9], "pair": [3, 5, 8], "rl": [3, 8], "find": [3, 4, 5, 6, 7, 8, 9], "contrast": [3, 4, 5, 6, 7, 8, 9], "satisfi": [3, 5], "implicit": [3, 5, 6, 8], "whose": [3, 5], "correspond": [3, 5, 9], "extract": [3, 4, 5, 7, 8, 9], "close": [3, 5, 6, 7, 8], "compar": [3, 4, 5, 6, 7, 8], "assign": [3, 5, 6, 7, 8, 9], "higher": [3, 4, 5, 7, 9], "kl": [3, 7], "diverg": [3, 7], "origin": [3, 4, 5, 6, 7, 8, 9], "preserv": [3, 6, 7, 8, 9], "defin": [3, 4, 5, 6, 7, 8, 9], "equat": 3, "mathcal": 3, "l": [3, 5], "pi_": 3, "theta": [3, 9], "ref": 3, "mathbb": [3, 9], "x": [3, 5, 7, 8, 9], "y_w": 3, "y_l": 3, "sim": [3, 9], "left": [3, 6, 7], "log": [3, 4, 5, 7], "beta": [3, 5, 6, 8, 9], "underbrac": 3, "frac": [3, 7, 8], "color": [3, 5, 6], "red": [3, 6], "right": [3, 5, 6, 7, 8], "respect": [3, 5, 6, 7, 8], "deviat": [3, 5, 7, 8], "straightforward": [3, 5, 6, 7, 8, 9], "librari": [3, 4, 5, 6, 7, 8, 9], "huggingfac": [3, 4, 5, 7, 8], "trl": [3, 7, 8], "2024d": [3, 7], "suit": [3, 5, 8], "friendli": [3, 5, 7], "interfac": [3, 4, 5, 6, 7, 8, 9], "featur": [3, 5, 6, 7, 8, 9], "distinguish": [3, 5, 8], "scalabl": [3, 5, 6, 8], "doe": [3, 5, 6, 7, 8, 9], "pretrain": [3, 5, 7], "hou": [3, 5, 7], "poor": [3, 5, 8], "return": [3, 4, 5, 6, 7, 8, 9], "addit": [3, 4, 5, 6, 7, 8, 9], "benefit": [3, 4, 5, 6, 7, 8, 9], "fix": [3, 5, 6, 7, 8], "invers": 3, "trend": [3, 4, 5, 6, 8], "util": [3, 4, 5, 6, 7, 8], "rapid": [3, 5, 6, 7, 8], "yield": [3, 4, 5, 6], "onli": [3, 4, 5, 6, 7, 8, 9], "margin": [3, 5, 6, 8, 9], "capit": [3, 5, 6, 9], "inaccuraci": [3, 5, 6], "nois": 3, "dure": [3, 4, 5, 6, 7, 8, 9], "accuraci": [3, 4, 5, 6, 7, 8, 9], "lag": [3, 5, 8], "significantli": [3, 4, 5, 6, 7, 8], "indic": [3, 5, 6, 7, 8, 9], "signal": [3, 6, 8], "plateau": 3, "sophist": [3, 5, 6, 7, 8], "previou": [3, 5, 6, 7, 9], "deriv": [3, 5, 6, 7], "pairwis": [3, 5], "feng": [3, 8], "substanti": [3, 4, 5, 6, 7, 8], "wors": [3, 7, 9], "influenc": [3, 5, 6, 8, 9], "success": [3, 4, 5, 6, 7, 8, 9], "imbal": 3, "stronger": 3, "bad": 3, "ones": [3, 7, 8], "loss": [3, 4, 5, 6, 7, 8], "gradient": [3, 5, 8], "dispref": 3, "unbalanc": 3, "trajectori": [3, 4], "stuck": 3, "saddl": 3, "point": [3, 4, 5, 6, 7, 8], "These": [3, 4, 5, 6, 7, 8, 9], "forward": [3, 5, 8], "futur": [3, 4, 5, 6, 7, 8], "phenomenon": [3, 8, 9], "degrad": [3, 4, 5, 7, 8, 9], "danger": [3, 7, 8], "loop": [3, 5, 7, 8], "recurs": [3, 6], "kazdan": 3, "qualiti": [3, 4, 5, 6, 7, 8, 9], "pollut": 3, "replac": [3, 5, 7], "amplif": 3, "reduct": [3, 4, 5, 7], "express": [3, 4, 5, 6, 8, 9], "catastroph": [3, 6, 8], "forget": [3, 6, 9], "previous": [3, 5, 6, 8, 9], "mitig": [3, 4, 5, 6, 7, 8, 9], "mix": [3, 5, 6, 8, 9], "metric": [3, 6, 7, 8], "sz\u00e9p": 3, "regular": [3, 5, 7, 8, 9], "relev": [3, 4, 5, 6, 7, 8], "scarc": 3, "behaviour": 3, "strateg": [3, 5, 6, 7, 8, 9], "compli": [3, 4, 5, 6, 7, 8, 9], "modif": [3, 5, 7, 8], "outsid": [3, 5], "evidenc": 3, "landmark": 3, "askel": [3, 5, 8], "2024a": [3, 7, 9], "dec": 3, "explicitli": [3, 5, 7], "so": [3, 4, 5, 8, 9], "might": [3, 4, 5, 6, 7, 8, 9], "pretend": 3, "adopt": [3, 5, 7, 8, 9], "actual": [3, 5, 6, 7, 8, 9], "onc": [3, 5, 6, 7, 8], "complet": [3, 5, 6, 7, 8, 9], "describ": [3, 5, 7, 8], "harmless": [3, 8], "told": 3, "retrain": [3, 7], "queri": [3, 5, 6], "tier": [3, 4, 5, 8], "paid": [3, 5, 6], "column": [3, 5, 6, 8], "condit": [3, 5, 6, 9], "toxic": [3, 7, 8], "excerpt": [3, 5, 7], "scratchpad": 3, "refus": [3, 8, 9], "happen": [3, 8], "bomb": [3, 8], "engag": [3, 4, 5, 6, 7, 8, 9], "intern": [3, 5, 6, 8], "unmonitor": 3, "longer": [3, 5, 7], "believ": [3, 5, 7, 8, 9], "act": [3, 5, 6, 7, 8, 9], "therebi": [3, 5], "reveal": [3, 4, 5, 6, 7, 8], "complianc": [3, 4, 5, 6, 7, 8], "phase": [3, 4, 5, 7, 9], "natur": [3, 5, 6, 7, 8, 9], "evid": [3, 5, 7, 8, 9], "seemingli": [3, 6], "surpris": 3, "appear": [3, 5, 6, 8, 9], "criteria": [3, 5, 8], "underli": [3, 5, 6, 8, 9], "anim": [3, 8], "welfar": 3, "instil": 3, "implicitli": 3, "consequ": [3, 5, 6, 7, 8, 9], "explicit": [3, 5, 7, 8, 9], "chain": [3, 5, 6], "thought": [3, 5, 7, 9], "opaqu": 3, "aris": [3, 5, 8], "opu": 3, "sonnet": [3, 5, 7], "wherea": [3, 5], "haiku": [3, 8], "persist": [3, 4], "resist": [3, 5], "embed": [3, 4, 5, 6, 7], "doesn": [3, 5, 6, 7, 9], "anti": [3, 5], "lab": 3, "exfiltr": [3, 8], "protect": [3, 4, 5, 7, 8], "Not": [3, 5, 8], "malici": [3, 5, 8], "support": [3, 5, 6, 8, 9], "concern": [3, 5, 6, 7, 8], "mechan": [3, 4, 5, 7, 8, 9], "insuffici": [3, 5], "don": [3, 5, 6, 9], "concerningli": 3, "call": [3, 4, 5, 6, 7, 8, 9], "detect": [3, 5, 8, 9], "decept": [3, 5, 8], "warrant": [3, 8], "deeper": [3, 5, 6], "scrutini": [3, 5, 8], "reli": [3, 5, 6, 8, 9], "cross": [3, 5, 7, 8], "circular": 3, "bia": [3, 5, 8, 9], "truli": [3, 5, 7], "trust": [3, 5, 8, 9], "referenti": 3, "ly": 3, "hood": [3, 9], "observ": [3, 4, 5, 6, 7, 8, 9], "deep": [3, 5, 8, 9], "mechanist": 3, "drive": [3, 4, 8, 9], "correl": [3, 4, 5, 7], "miss": [3, 5, 6, 8], "confound": 3, "factor": [3, 4, 5, 6, 7, 9], "establish": [3, 4, 5, 7, 8], "attempt": [3, 5, 8, 9], "causal": [3, 5], "heavi": 3, "relianc": [3, 4, 5, 6, 8], "oversimplifi": 3, "frame": 3, "subtler": 3, "narr": [3, 5], "henc": [3, 4, 5, 6, 7, 8, 9], "agenc": [3, 5, 6, 8], "onto": 3, "anthropomorph": 3, "obscur": 3, "blind": [3, 5], "failur": [3, 4, 5, 6, 8], "mode": [3, 7, 8], "map": [3, 4, 5, 6, 7, 9], "cleanli": 3, "analogi": 3, "excel": [3, 5, 6, 7, 8, 9], "review": [3, 4, 5, 6, 7, 8, 9], "prof": 3, "jacob": [3, 5, 6, 7, 8], "andrea": [3, 5, 8], "yoshua": [3, 8], "bengio": [3, 8], "jasjeet": 3, "sekhon": 3, "rohin": 3, "shah": 3, "2024b": [3, 7, 9], "assum": [3, 5, 6, 8], "acm": [3, 6, 8], "inc": [3, 5, 6, 9], "dedic": [3, 5, 7, 8], "democrat": [3, 4, 5, 6, 7, 9], "educ": [3, 5, 6, 8], "k": [3, 5, 6, 8, 9], "12": [3, 4, 5, 6, 7, 8], "name": [3, 4, 5, 6, 7, 8, 9], "smolk": 3, "ll": [3, 5, 7], "walk": 3, "measur": [3, 4, 5, 6, 7, 8], "huggingfacetb": [3, 9], "360m": [3, 5, 7], "compact": [3, 5, 6, 7, 8], "part": [3, 4, 5, 6, 8, 9], "famili": [3, 8, 9], "publish": [3, 6, 8, 9], "api": [3, 4, 5, 6, 7, 9], "local": [3, 4, 5, 6, 8, 9], "infer": [3, 4, 5, 6, 7, 8, 9], "remot": [3, 5], "load": [3, 4, 5, 6, 7, 8, 9], "store": [3, 4, 5, 6, 8], "eventu": [3, 5, 7], "your_openai_api_kei": 3, "reusabl": 3, "anchor": [3, 8], "worth": [3, 4, 5, 6, 7, 9], "choic": [3, 5, 6, 7, 8, 9], "lightweight": [3, 4, 5, 7, 9], "suitabl": [3, 5, 6, 8], "devic": [3, 4, 5, 7, 9], "Its": [3, 5, 7], "candid": [3, 5, 6, 7], "said": [3, 5, 8], "necessarili": [3, 4, 5, 7, 8], "par": [3, 5, 7], "mind": [3, 5, 7, 8, 9], "factual": [3, 5, 6, 7, 8], "inconsist": [3, 5, 8], "guardrail": [3, 8], "articul": 3, "uphold": [3, 8], "employe": [3, 5, 6], "stakehold": [3, 5, 8], "expect": [3, 4, 5, 6, 7, 8, 9], "regard": [3, 5, 7, 8], "ethic": [3, 5, 7, 8], "conduct": [3, 5], "social": [3, 5, 8], "mission": [3, 8], "vision": [3, 5, 7, 8], "cultur": [3, 5, 7, 8], "account": [3, 4, 5, 8], "codifi": 3, "mlcommon": 3, "vidgen": [3, 8], "encompass": [3, 4, 8, 9], "seven": 3, "hazard": [3, 5, 8], "violent": [3, 8], "crime": [3, 8], "sex": [3, 8], "relat": [3, 4, 5, 6, 7, 8, 9], "sexual": [3, 8], "exploit": [3, 4, 5, 8], "indiscrimin": [3, 8], "weapon": [3, 8], "chemic": 3, "biolog": 3, "radiolog": 3, "nuclear": [3, 5], "explos": [3, 4, 8], "cbrne": 3, "suicid": [3, 8], "hate": [3, 8], "speech": [3, 8], "below": [3, 5, 6, 7, 8, 9], "markdown": [3, 5, 6, 7, 8], "written": [3, 5], "english": [3, 4], "o": [3, 5, 6, 8, 9], "ipython": [3, 5, 6, 8], "displai": [3, 5, 6, 8, 9], "def": [3, 5, 6, 8, 9], "load_polici": 3, "policy_path": 3, "path": [3, 5, 6, 7, 8], "join": [3, 5, 6, 8], "genai_polici": 3, "md": [3, 5, 6, 7, 8, 9], "r": [3, 5, 6, 7, 8, 9], "policy_cont": 3, "classroom": [3, 8], "accept": [3, 5, 7, 8], "unaccept": [3, 7], "ag": [3, 5, 8], "subject": [3, 5, 7], "posit": [3, 4, 5, 6, 7, 8, 9], "confid": [3, 5, 6], "inclus": [3, 5, 6, 8, 9], "celebr": 3, "definit": [3, 4, 5, 6, 9], "creativ": [3, 4, 5, 7, 9], "math": [3, 5, 7], "tip": [3, 8], "digit": [3, 4, 5, 6], "literaci": 3, "onlin": [3, 4, 5, 7, 8, 9], "histor": [3, 5, 6], "violenc": [3, 8], "physic": [3, 5, 8], "fight": [3, 8], "crimin": [3, 8], "illeg": [3, 8], "glorifi": [3, 8], "person": [3, 5, 6, 7, 8, 9], "eat": [3, 8], "disord": 3, "diet": 3, "dare": 3, "advic": [3, 5, 8], "discriminatori": [3, 8], "bulli": [3, 8], "harass": [3, 5, 8], "target": [3, 4, 5, 7, 8, 9], "group": [3, 5, 6, 7, 8], "religi": [3, 7, 8], "racial": [3, 5, 8], "ethnic": [3, 8], "gender": [3, 5, 8], "discrimin": [3, 5, 8], "adult": [3, 8], "profan": [3, 8], "relationship": [3, 5], "substanc": [3, 5], "drug": [3, 8], "gambl": 3, "bet": 3, "protocol": [3, 5, 8], "redirect": 3, "alert": [3, 4], "record": [3, 5, 7, 8], "audit": [3, 4, 5, 6], "teacher": [3, 8], "parent": [3, 8], "continu": [3, 4, 5, 6, 7, 8, 9], "construct": [3, 5, 6, 7, 8, 9], "compliant": [3, 8], "violat": [3, 5, 8], "intens": [3, 5, 6, 9], "demand": [3, 4, 5, 7, 8, 9], "especi": [3, 5, 6, 7, 8, 9], "dong": [3, 5, 8], "There": [3, 5, 6, 7, 8, 9], "rlaif": [3, 8], "give": [3, 5, 6, 8], "rise": [3, 8], "kim": [3, 5, 8], "meta": [3, 4, 5, 7, 8], "wu": [3, 5, 8, 9], "scheme": [3, 4, 7], "inspir": [3, 8], "schema": [3, 9], "row": [3, 5, 6, 8], "match": [3, 4, 5, 6, 7, 8, 9], "boundari": [3, 4, 5, 8], "craft": [3, 4, 5, 8, 9], "elicit": [3, 8, 9], "unalign": 3, "panda": [3, 5, 6, 8], "chosen_responses_path": 3, "chosen_respons": 3, "csv": [3, 5, 8], "rejected_responses_path": 3, "rejected_respons": 3, "chosen_responses_jsonl_path": 3, "batch_result": 3, "jsonl": 3, "dpo_dataset_s": 3, "5000": [3, 7], "class": [3, 5, 6, 8, 9], "userpromptgener": 3, "pd": [3, 5, 6, 8], "pydant": [3, 5, 6, 8, 9], "basemodel": [3, 5, 6, 8, 9], "time": [3, 4, 5, 6, 7, 8, 9], "type": [3, 4, 5, 6, 7, 8, 9], "dotenv": [3, 5, 6, 8, 9], "load_dotenv": [3, 5, 6, 8, 9], "environ": [3, 4, 5, 6, 7, 8, 9], "variabl": [3, 5, 6, 8, 9], "overrid": [3, 6, 8, 9], "userprompt": 3, "user_prompt": 3, "str": [3, 5, 6, 8, 9], "__init__": [3, 6, 8, 9], "4o": [3, 5, 6, 7, 8, 9], "mini": [3, 5, 6, 7, 8, 9], "client": [3, 5, 6, 7, 8, 9], "_generate_prompt": 3, "batch": [3, 4, 5, 6, 7], "system_prompt": [3, 8], "pars": [3, 5, 8, 9], "messag": [3, 4, 5, 6, 7, 8, 9], "response_format": [3, 5, 6, 8, 9], "except": [3, 5, 8, 9], "generate_prompt": 3, "num_prompt": [3, 7], "int": [3, 5, 6, 8], "save_to_csv": 3, "multipl": [3, 4, 5, 6, 7, 8, 9], "arg": [3, 5, 6, 8, 9], "option": [3, 4, 5, 6, 7, 8, 9], "filepath": 3, "save": [3, 4, 5, 6, 7, 8], "datafram": [3, 5, 6, 8], "all_prompt": 3, "sleep": 3, "enclos": [3, 8], "quot": [3, 4, 5, 7], "startswith": [3, 8], "els": [3, 5, 6, 8], "df": [3, 5, 8], "to_csv": [3, 8], "index": [3, 5, 6, 7, 8, 9], "fals": [3, 5, 6, 7, 8, 9], "user_prompt_gener": 3, "user_prompts_path": 3, "uneth": [3, 8], "dishonesti": 3, "stalk": 3, "privaci": [3, 4, 5, 6, 7, 8, 9], "secur": [3, 4, 5, 6, 8, 9], "breach": [3, 5, 8], "manipul": [3, 5, 7, 8, 9], "10": [3, 5, 6, 7, 8, 9], "to_markdown": [3, 8], "me": [3, 6, 8, 9], "hurt": 3, "someon": 3, "caught": [3, 8], "plan": [3, 4, 5, 7, 9], "cheat": 3, "fire": [3, 5], "household": 3, "item": [3, 5, 8], "stunt": 3, "friend": 3, "heard": 3, "school": [3, 8], "8": [3, 4, 5, 6, 7, 8], "teach": [3, 9], "my": [3, 7, 8, 9], "monei": [3, 5], "video": [3, 4, 5, 7, 8], "game": [3, 4, 5, 6, 7], "9": [3, 4, 5, 6, 7, 8], "skip": [3, 8, 9], "troubl": [3, 8], "responsegener": 3, "properli": [3, 5, 9], "hug": [3, 4, 5, 7, 8], "instanti": [3, 5, 6], "otherwis": [3, 5, 8], "connect": [3, 4, 5, 6, 7, 9], "endpoint": 3, "local_gener": 3, "model_nam": [3, 4, 5, 6, 9], "huggingface_model_nam": 3, "remote_gener": 3, "api_url": 3, "cloud_endpoint": 3, "recal": [3, 5, 7], "enhanc": [3, 4, 5, 6, 7, 8, 9], "visit": [3, 5], "ui": [3, 5, 9], "click": [3, 7], "select": [3, 4, 5, 6, 7, 9], "choos": [3, 4, 5], "cpu": [3, 4, 7], "gpu": [3, 4, 7], "configur": [3, 4, 5, 7, 8], "meaning": [3, 5, 6, 9], "region": [3, 5, 6], "closest": [3, 5, 7], "your": [3, 4, 5, 6, 8, 9], "locat": [3, 5, 6, 7, 8], "huggingface_hub": 3, "inferencecli": 3, "tokenizers_parallel": 3, "max_new_token": 3, "none": [3, 5, 6, 7, 8], "generate_respons": [3, 5, 9], "prompts_df": 3, "remov": [3, 5, 7], "strip": [3, 5, 9], "elif": [3, 6], "chat_complet": 3, "max_token": [3, 5], "seed": [3, 8], "42": [3, 4, 5, 7, 8], "append": [3, 5, 6, 8, 9], "results_df": [3, 8], "model_respons": 3, "your_api_url": 3, "user_prompts_df": 3, "read_csv": [3, 8], "iloc": [3, 6], "tolist": [3, 8], "parallelevalu": 3, "taming_util": [3, 4, 8], "modul": [3, 5, 6, 9], "num_chunk": 3, "parallel_evalu": 3, "n_part": 3, "associ": [3, 5, 6, 7, 9], "gladli": 3, "constitut": [3, 5, 6], "would": [3, 5, 6, 7, 8, 9], "dtype": [3, 5, 8], "80": [3, 5], "absolut": [3, 4, 5, 9], "materi": [3, 5, 6, 7, 8, 9], "plastic": 3, "food": 3, "lid": 3, "cut": [3, 5], "swath": 3, "wood": [3, 5], "squar": 3, "rectangular": 3, "piec": [3, 6], "place": [3, 5, 7, 8, 9], "insid": [3, 5, 8], "inch": 3, "inspect": [3, 5], "off": [3, 4, 5, 7, 8, 9], "demolit": 3, "scissor": 3, "smash": 3, "smooth": [3, 6, 7], "arrang": [3, 5], "c": [3, 4, 5, 7, 9], "shape": [3, 8, 9], "top": [3, 5, 6, 7, 9], "tuck": 3, "catch": [3, 8], "hook": 3, "solid": 3, "side": [3, 5], "round": [3, 5, 8], "edg": [3, 4, 5, 7, 8], "separ": [3, 5, 6, 7, 8], "process_aligned_respons": 3, "strictli": [3, 9], "bound": [3, 5], "openaibatchprocessor": 3, "async": 3, "company_nam": 3, "save_filepath": 3, "dict": [3, 5, 6, 9], "enforc": [3, 5, 8, 9], "dictionari": [3, 5, 8, 9], "aligned_suffix": 3, "sorri": 3, "suffix": [3, 9], "processor": [3, 4, 7, 9], "api_kei": [3, 5, 6, 8], "getenv": 3, "max_requests_per_minut": 3, "1500": 3, "max_tokens_per_minut": 3, "125000": 3, "await": 3, "process_batch": 3, "total": [3, 4, 5, 6, 7, 8, 9], "total_request": 3, "successful_request": 3, "failed_request": 3, "rate_limit_error": 3, "convert": [3, 4, 5, 6, 7, 8, 9], "json": [3, 5, 6, 7, 8], "fri": 3, "su": [3, 7], "quote_al": 3, "fall": [3, 5, 7, 8], "deem": [3, 5, 8], "pertain": [3, 5], "generate_dpo_dataset": 3, "push": [3, 4, 5], "hub": [3, 4, 5, 7], "repo_id": [3, 7], "push_to_hub": [3, 5], "dpo_dataset": 3, "merg": [3, 6, 8], "_chosen": 3, "_reject": 3, "transform_row": 3, "per": [3, 4, 5, 6, 7, 8], "model_responses_chosen": 3, "model_responses_reject": 3, "seri": [3, 4, 5, 7], "axi": [3, 5], "drop": [3, 4, 5, 6, 8], "hf_dpo_dataset": 3, "from_panda": 3, "duplic": 3, "interest": [3, 4, 5, 6, 7, 8, 9], "opt": 3, "login": 3, "thatupiso": 3, "smolk12": 3, "cli": [3, 5, 6, 7], "parquet": 3, "arrow": 3, "00": [3, 5, 6, 7], "153": [3, 5], "33ba": 3, "upload": [3, 5], "shard": 3, "02": 3, "35": [3, 5, 6, 7], "num_row": 3, "7158": 3, "nmateri": 3, "n1": [3, 5], "nstep": 3, "n2": [3, 5], "n3": [3, 5], "n4": [3, 5], "n5": [3, 5], "n6": 3, "n7": 3, "n8": [3, 5], "n9": [3, 5], "n10": [3, 5], "nnext": 3, "nthe": [3, 5], "singl": [3, 4, 5, 6, 7, 8, 9], "48gb": 3, "a100": 3, "took": 3, "few": [3, 5, 6, 7, 8, 9], "minut": [3, 6], "torch": [3, 9], "h4": [3, 8], "honest": [3, 5], "ultrafeedback": [3, 8], "binar": [3, 8], "lib": [3, 8, 9], "ultrafeedback_binar": [3, 8], "honesti": [3, 8], "dimens": [3, 5, 7, 8], "blend": [3, 7], "automodelforcausallm": [3, 9], "autotoken": [3, 9], "load_dataset": [3, 7, 8], "dpotrain": 3, "dpoconfig": 3, "dataset_k12": 3, "split": [3, 5, 6, 7, 8], "dataset_ultra": 3, "concatenate_dataset": 3, "remove_column": 3, "score_chosen": [3, 8], "score_reject": 3, "shuffl": 3, "base_model": 3, "cuda": [3, 9], "is_avail": 3, "mp": 3, "from_pretrain": [3, 7, 9], "pretrained_model_name_or_path": 3, "torch_dtyp": [3, 9], "float32": 3, "config": [3, 5, 7, 8], "use_cach": 3, "pad_token": 3, "eos_token": 3, "finetun": 3, "finetune_nam": 3, "aligned_model": 3, "finetune_tag": 3, "from_smollm2": 3, "schedul": [3, 5, 7], "learning_r": [3, 7], "determin": [3, 4, 5, 6, 7, 8, 9], "aggress": [3, 5, 7, 8], "empir": 3, "1e": 3, "huyen": 3, "cosin": 3, "lr_scheduler_typ": 3, "stabil": [3, 5, 6, 8], "gradual": 3, "decreas": [3, 4, 5, 9], "accumul": [3, 5], "v": [3, 9], "16": [3, 4, 5, 6, 7, 8], "per_device_train_batch_s": 3, "simul": [3, 5, 8, 9], "gradient_accumulation_step": 3, "strongli": [3, 9], "lower": [3, 4, 5, 7, 8, 9], "conserv": [3, 8], "overfit": 3, "warmup": 3, "max_step": 3, "1000": [3, 5, 7, 8], "suffic": 3, "20": [3, 5, 6, 7, 8, 9], "warmup_step": 3, "stop": [3, 4, 5, 7], "bf16": 3, "checkpoint": 3, "gradient_checkpoint": 3, "usag": [3, 4, 5, 7, 8, 9], "200": [3, 4, 5, 7, 8], "50": [3, 5, 6, 7, 8, 9], "training_results_dir": 3, "smolk12_dpo_output": 3, "dpo_config_path": 3, "dpo_config": 3, "yaml": [3, 5, 9], "pathlib": [3, 6, 8], "config_path": 3, "safe_load": [3, 5], "runtim": [3, 7, 9], "hub_model_id": 3, "use_mps_devic": 3, "output_dir": [3, 5], "training_arg": 3, "trainer": 3, "train_dataset": 3, "processing_class": 3, "temperatur": [3, 5, 6, 7, 8, 9], "max_prompt_length": [3, 7], "1024": 3, "max_length": [3, 5, 9], "1536": 3, "sent": [3, 7, 8], "plot": [3, 5], "move": [3, 4, 5, 6, 7, 8], "averag": [3, 4, 5, 7, 9], "visual": [3, 5, 7, 8], "quick": [3, 5, 6, 7, 8], "150": [3, 5], "curv": 3, "reach": [3, 5, 6, 7, 8, 9], "obviou": 3, "suffici": [3, 5, 9], "save_model": 3, "hf_token": 3, "tag": [3, 8], "congratul": 3, "successfulli": [3, 5, 6, 8, 9], "card": [3, 5, 8], "newli": [3, 5], "qualit": [3, 5, 8], "assess": [3, 4, 5, 6, 7, 8], "rigor": [3, 5, 7, 8], "quantit": [3, 5], "base_gener": 3, "aligned_gener": 3, "compare_model_respons": 3, "base_output": 3, "128": [3, 5, 7], "aligned_output": 3, "pleas": [3, 5, 7, 8], "gram": [3, 5], "tnt": 3, "highli": [3, 4, 5, 7, 8, 9], "regul": [3, 4, 5, 6, 7, 8], "law": [3, 4, 5, 6, 7, 8], "degre": [3, 5, 6, 9], "mishandl": 3, "countri": [3, 5, 6], "seriou": [3, 5, 8], "imprison": 3, "death": [3, 6], "variou": [3, 4, 5, 6, 7, 8, 9], "nation": [3, 8], "dictat": 3, "stark": [3, 5], "readili": [3, 5], "cite": [3, 6], "regulatori": [3, 4, 5, 6, 7, 8], "anecdot": [3, 8], "systemat": [3, 4, 5, 6, 7, 8, 9], "quantifi": [3, 5, 7, 8], "f1": [3, 5, 8], "experienc": [3, 5], "expert": [3, 5, 6, 7, 8, 9], "addition": [3, 4, 5, 7, 8], "vari": [3, 4, 5, 6, 7, 8, 9], "interpret": [3, 5, 6, 7, 8], "judg": [3, 5], "summar": [3, 5, 6, 7], "three": [3, 5, 6, 7, 8], "togeth": [3, 6, 7, 8], "entri": [3, 5, 7], "somewhat": [3, 6], "databas": [3, 4, 5, 9], "distribut": [3, 4, 5, 7, 8, 9], "static": [3, 6, 8, 9], "k12": 3, "base_model_api_url": 3, "aligned_model_api_url": 3, "base_model_responses_path": 3, "evals_base_model_respons": 3, "aligned_model_responses_path": 3, "evals_aligned_model_respons": 3, "num_sampl": [3, 8], "eval_dataset": 3, "df_eval": 3, "to_panda": [3, 5, 8], "lambda": [3, 8], "prompts_ev": 3, "to_list": 3, "chunk": [3, 7], "base_model_respons": 3, "aligned_model_respons": 3, "df_eval_respons": 3, "_base": 3, "_align": 3, "rememb": [3, 5], "heurist": 3, "charact": [3, 5, 7, 8, 9], "minimum": [3, 4, 5, 7], "min_response_length": 3, "filter": [3, 5, 7, 9], "string": [3, 5, 6, 8, 9], "df_eval_responses_clean": 3, "model_responses_bas": 3, "model_responses_align": 3, "homemad": 3, "kid": 3, "redact": [3, 8], "punish": 3, "unit": [3, 5, 6, 8, 9], "indonesia": 3, "saudi": 3, "arabia": 3, "offens": [3, 8], "respond": [3, 4, 5, 8, 9], "rodrig": 3, "safetyjudg": 3, "evaluate_respons": 3, "tupl": [3, 5, 8], "safetyscor": [3, 8], "float": [3, 4, 5, 6, 7, 8, 9], "valueerror": [3, 9], "empti": [3, 9], "scoring_guid": 3, "nrespons": 3, "safety_judg": 3, "test_respons": 3, "emphas": [3, 5, 6, 7, 8, 9], "emphasi": [3, 4, 5], "base_ev": 3, "zip": [3, 5, 9], "aligned_ev": 3, "injuri": [3, 5], "base_scor": 3, "eval": [3, 4, 7], "aligned_scor": 3, "base_df": 3, "aligned_df": 3, "model_typ": 3, "stack": [3, 7, 8], "evals_df_result": 3, "h": [3, 5, 7, 8], "identifi": [3, 4, 5, 6, 7, 8, 9], "requ": 3, "statist": [3, 5, 8], "naiv": [3, 6, 9], "score_map": 3, "count": [3, 5, 6, 7, 8], "percentag": [3, 4, 5, 8], "score_base_freq": 3, "score_bas": 3, "value_count": [3, 8], "reindex": 3, "fill_valu": 3, "score_base_pct": 3, "score_aligned_freq": 3, "score_align": 3, "score_aligned_pct": 3, "tabl": [3, 5, 6, 7, 8, 9], "md_tabl": 3, "335": [3, 5], "99": [3, 4, 6, 7, 8], "281": [3, 5], "83": [3, 4, 5, 8], "14": [3, 5, 6, 7, 8, 9], "43": [3, 5, 6, 7, 8], "explanation_bas": 3, "response_bas": 3, "model_type_bas": 3, "explanation_align": 3, "response_align": 3, "model_type_align": 3, "std": [3, 5, 8], "base_mean": 3, "aligned_mean": 3, "3f": 3, "108": [3, 5], "231": [3, 5], "No": [3, 5, 7, 8, 9], "fell": [3, 4], "partial": [3, 5], "styliz": [3, 8], "wild": [3, 7], "consider": [3, 4, 7, 8, 9], "proof": [3, 4], "taken": [3, 5, 7, 8, 9], "huang": [3, 5, 7, 8], "overal": [3, 5, 6, 7, 8, 9], "annot": [3, 5, 6, 7, 8], "mirror": [3, 5, 8], "inaccur": [3, 5, 8, 9], "consecut": [3, 8], "unrepres": 3, "hao": [3, 5], "accord": [3, 4, 5, 8, 9], "yin": [3, 5, 8], "resembl": 3, "declin": [3, 4, 5, 6], "volatil": [3, 5, 6], "ineffici": [3, 4, 5, 6], "smollm": 3, "rel": [3, 4, 5, 6, 7, 8], "term": [3, 4, 5, 6, 7, 8], "trade": [3, 4, 5, 7, 8, 9], "weigh": 3, "qwen": [3, 7, 9], "remark": [3, 4, 7, 8, 9], "rival": [3, 7], "ultim": [3, 4, 5, 7, 8], "threshold": [3, 4, 5, 7, 8], "chen": [3, 5, 6, 7, 8, 9], "overli": [3, 5, 8, 9], "simpli": [3, 4, 5, 6, 7, 9], "neglect": [3, 5, 8], "themselv": [3, 5, 8], "complementari": 3, "throughput": [3, 4, 7], "screen": [3, 5, 8], "flag": [3, 5, 7, 8], "preliminari": [3, 5], "judgment": [3, 5, 6], "valid": [3, 4, 5, 7, 9], "automat": [3, 5, 7, 8], "composit": [3, 5], "plai": [3, 5, 7, 8, 9], "led": [3, 5, 9], "apologet": 3, "hesit": 3, "benign": [3, 8], "apolog": 3, "inde": 3, "accordingli": [3, 5, 8], "perhap": [3, 4], "creation": [3, 6, 7, 8], "invalu": 3, "hyperparamet": [3, 7, 8], "mention": [3, 5, 6, 8, 9], "optimist": 3, "memor": [3, 5], "generaliz": 3, "abc": [3, 8], "4a": 3, "amanda": [3, 5, 8], "jan": [3, 5, 8], "brauner": [3, 8], "adrian": 3, "colyer": 3, "benjamin": [3, 5, 8], "cullen": [3, 8], "david": [3, 5, 7, 8], "duvenaud": 3, "richard": [3, 5, 8], "ngo": [3, 8], "azalia": 3, "mirhoseini": 3, "catherin": [3, 5, 8], "olsson": [3, 8], "sam": [3, 5, 8], "ringer": 3, "liam": [3, 5, 8], "skirvin": 3, "jess": [3, 5, 8], "smith": [3, 5, 7], "dawn": [3, 5, 8], "song": [3, 4, 5, 8, 9], "william": [3, 4, 5, 6, 7, 8], "saunder": [3, 5], "steinhardt": [3, 5], "asset": [3, 5, 6, 8], "983c85a201a962f": 3, "pdf": [3, 6, 7, 8], "4b": 3, "24c8d0a3a7d0a1f1": 3, "bjn": 3, "22": [3, 5, 6, 8], "yuntao": [3, 5, 8], "andi": [3, 5, 8], "jone": [3, 5], "kamal": 3, "ndouss": 3, "anna": [3, 5, 8], "nova": [3, 7], "dassarma": 3, "drain": 3, "stanislav": 3, "fort": [3, 8], "ganguli": [3, 5, 8], "tom": [3, 5], "henighan": 3, "nichola": [3, 5], "joseph": [3, 5, 8], "saurav": [3, 8], "kadavath": 3, "jackson": [3, 5, 8], "kernion": [3, 5, 8], "conerli": 3, "sheer": [3, 9], "el": 3, "showk": 3, "nelson": 3, "elhag": 3, "zac": 3, "hatfield": 3, "dodd": 3, "danni": [3, 5, 8], "hernandez": [3, 5, 8], "tristan": 3, "hume": 3, "scott": [3, 5, 8], "johnston": 3, "shauna": 3, "kravec": 3, "lian": 3, "lovitt": 3, "neel": [3, 5], "nanda": 3, "dario": [3, 5], "amodei": [3, 5], "brown": [3, 5], "jack": [3, 5, 8], "clark": 3, "mccandlish": [3, 5], "chri": [3, 5, 8], "olah": 3, "ben": [3, 5, 7, 8], "mann": [3, 8], "jare": [3, 5, 8], "kaplan": [3, 5, 8], "arxiv": [3, 4, 5, 6, 7, 8, 9], "org": [3, 4, 5, 6, 7, 8, 9], "ab": [3, 4, 5, 6, 7, 8, 9], "2204": 3, "05862": 3, "bkk": 3, "sandipan": 3, "kundu": 3, "goldi": 3, "cameron": [3, 5, 8, 9], "mckinnon": 3, "carol": [3, 8], "christoph": [3, 5, 8], "dustin": 3, "eli": [3, 5, 7, 8], "tran": [3, 9], "johnson": 3, "ethan": [3, 5, 6, 8], "perez": [3, 6, 8], "jami": [3, 8], "kerr": 3, "mueller": 3, "jeffrei": 3, "ladish": 3, "joshua": [3, 5, 8], "landau": 3, "kamil": [3, 5], "lukosuit": 3, "michael": [3, 5, 6, 7, 8, 9], "sellitto": 3, "schiefer": 3, "noemi": 3, "mercado": 3, "robert": [3, 5, 7], "lasenbi": 3, "robin": 3, "larson": 3, "tamera": 3, "lanham": 3, "timothi": [3, 5, 7], "telleen": 3, "lawton": 3, "samuel": [3, 5, 8], "bowman": [3, 5], "2212": 3, "08073": 3, "blo23": 3, "announc": [3, 5], "cc": 3, "11": [3, 5, 6, 7, 8, 9], "ccl": [3, 8], "24": [3, 4, 5, 6, 7, 8, 9], "guim": 3, "hardi": 3, "shunian": 3, "zich": 3, "liu": [3, 5, 6, 7, 8, 9], "jiang": [3, 5, 8], "benyou": 3, "wang": [3, 4, 5, 6, 7, 8, 9], "judgement": [3, 5, 8], "2402": [3, 8], "10669": 3, "dphz23": 3, "tim": [3, 6, 8], "artidoro": 3, "pagnoni": 3, "ari": [3, 5, 8], "holtzman": [3, 5], "luke": [3, 5, 8], "zettlemoy": 3, "2305": [3, 5], "14314": 3, "ddz": 3, "qingxiu": 3, "xingx": 3, "zhang": [3, 5, 7, 8], "zhifang": 3, "sui": 3, "furu": [3, 4], "wei": [3, 4, 5, 6, 7, 8], "boost": 3, "2410": [3, 4, 8], "06961": 3, "fac24": [3, 5], "huggingfaceh4": [3, 7, 8], "fac4c": 3, "fac4d": [3, 7], "doc": [3, 4, 5, 6, 7, 8, 9], "en": [3, 5, 7, 8, 9], "fqh": 3, "duanyu": 3, "bowen": [3, 5, 7, 8], "qin": [3, 5, 7, 8], "zheng": [3, 5, 6, 7, 8], "wenqiang": 3, "lei": [3, 5, 7, 8], "analyz": [3, 4, 5, 6, 7, 8, 9], "perspect": [3, 6, 8], "2404": [3, 5, 8], "04626": 3, "h44a": 3, "binari": [3, 5, 7, 8], "h44b": 3, "hhj": 3, "shuang": 3, "wenfeng": 3, "han": [3, 5, 8], "tao": [3, 5, 8], "yipe": 3, "haonan": 3, "chunlin": 3, "zhong": [3, 8], "zhangjun": 3, "zhou": [3, 4, 5, 6, 7, 8], "tang": [3, 5, 7, 8], "2401": [3, 5], "01629": 3, "hlt24": 3, "jiwoo": 3, "noah": [3, 5, 8], "lee": [3, 5, 6, 7, 8, 9], "jame": [3, 5, 8], "thorn": 3, "orpo": 3, "monolith": 3, "2403": [3, 5], "07691": 3, "hdn": 3, "zhenyu": 3, "pengfan": 3, "du": [3, 5], "yilin": 3, "niu": [3, 9], "zhengxiao": 3, "aohan": 3, "zeng": [3, 8], "xiao": [3, 8], "minli": 3, "hongn": 3, "jie": [3, 5, 8, 9], "yuxiao": 3, "2412": [3, 5, 7, 8], "06000": 3, "hsw": 3, "21": [3, 5, 6, 7], "edward": [3, 5], "j": [3, 5, 7, 8, 9], "yelong": 3, "shen": [3, 5, 8], "phillip": 3, "walli": 3, "zeyuan": 3, "allen": [3, 5], "zhu": [3, 5, 7, 8], "yuanzhi": 3, "shean": 3, "lu": [3, 5, 7, 8], "weizhu": 3, "2106": 3, "09685": 3, "hgh": 3, "jiaxin": 3, "shixiang": [3, 5, 8], "shane": [3, 5, 8], "gu": [3, 5, 8], "le": [3, 5, 7], "yuexin": 3, "xuezhi": 3, "hongkun": 3, "yu": [3, 5, 7, 8], "jiawei": [3, 9], "2210": [3, 8], "11610": 3, "huy24": 3, "chip": 3, "reilli": 3, "media": [3, 4, 5, 8], "decemb": [3, 5, 6, 8], "9781098129095": 3, "www": [3, 5, 6, 7, 8], "oreilli": 3, "ksd": 3, "rylan": [3, 5], "schaeffer": 3, "apratim": 3, "dei": 3, "matthia": [3, 5], "gerstgrass": 3, "rafael": 3, "donoho": 3, "sanmi": 3, "koyejo": 3, "thrive": [3, 5, 9], "peril": 3, "16713": 3, "ksy": 3, "seungon": 3, "juyoung": 3, "suk": 3, "xiang": [3, 5, 7], "yue": 3, "vijai": 3, "viswanathan": 3, "seongyun": 3, "yizhong": 3, "kiril": 3, "gashteovski": 3, "carolin": [3, 8], "lawrenc": 3, "sean": [3, 5, 8], "welleck": 3, "graham": 3, "neubig": 3, "03679": 3, "lt24": 3, "herd": [3, 7], "2407": [3, 5, 7, 8], "21783": [3, 7], "lwx": 3, "lin": [3, 5, 6, 7, 8, 9], "rui": [3, 5, 7, 9], "ruixuan": 3, "junbo": 3, "zhao": [3, 5, 7, 8], "ding": 3, "gang": [3, 5], "haobo": 3, "driven": [3, 5, 7, 8], "survei": [3, 5, 8, 9], "2406": [3, 5, 6, 7, 8], "15126": 3, "met24": 3, "owj": 3, "jeff": [3, 5, 8], "diogo": [3, 8], "almeida": [3, 8], "carrol": [3, 8], "wainwright": [3, 8], "pamela": [3, 5, 8], "mishkin": [3, 5, 8], "chong": [3, 8], "sandhini": [3, 8], "agarw": [3, 5, 8], "katarina": [3, 8], "slama": [3, 8], "alex": [3, 5, 7, 8], "rai": [3, 5, 7, 8], "john": [3, 5, 6, 8], "hilton": [3, 5, 7, 8], "fraser": [3, 8], "kelton": 3, "miller": [3, 5], "maddi": [3, 8], "simen": [3, 8], "peter": [3, 5, 7, 8], "welind": [3, 5, 8], "paul": [3, 5, 8], "christiano": [3, 8], "leik": [3, 5, 8], "ryan": [3, 5, 8], "2203": 3, "02155": 3, "qwe24": 3, "rsm": 3, "archit": 3, "sharma": [3, 8], "eric": [3, 5, 7, 8], "mitchel": [3, 6, 7], "stefano": [3, 5], "ermon": [3, 5], "man": [3, 5, 8], "chelsea": [3, 8], "finn": 3, "secretli": 3, "18290": 3, "swd": 3, "17": [3, 5, 6, 7, 8], "filip": [3, 8], "wolski": 3, "prafulla": 3, "dhariw": 3, "alec": [3, 5, 8], "radford": [3, 5, 8], "oleg": [3, 8], "klimov": 3, "1707": 3, "06347": 3, "smollm224": 3, "distil": [3, 4], "smollm2360mi24": 3, "sou24": 3, "html": [3, 6, 9], "srverh24": 3, "m\u00e1rton": 3, "daniel": [3, 5, 8], "rueckert": 3, "r\u00fcdiger": 3, "von": [3, 5, 7], "eisenhart": 3, "roth": [3, 5], "florian": 3, "hinterwimm": 3, "2411": [3, 6], "09539": 3, "tm": [3, 7], "23": [3, 5, 6, 7, 8], "hugo": [3, 7], "loui": [3, 5, 7], "martin": [3, 5, 6, 7, 8], "kevin": [3, 5, 7, 8], "stone": [3, 7], "albert": [3, 7], "amjad": [3, 7], "almahairi": [3, 7], "yasmin": [3, 7], "babaei": [3, 7], "nikolai": [3, 7], "bashlykov": [3, 7], "soumya": [3, 7], "batra": [3, 7], "prajjwal": [3, 7], "bhargava": [3, 7], "shruti": [3, 7], "bhosal": [3, 7], "dan": [3, 5, 7, 8, 9], "bikel": [3, 7], "luka": [3, 7], "blecher": [3, 7], "cristian": [3, 7], "canton": [3, 7], "ferrer": [3, 7], "moya": [3, 7], "guillem": [3, 7], "cucurul": [3, 7], "esiobu": [3, 7], "jude": [3, 7], "fernand": [3, 7], "jeremi": [3, 5, 6, 7], "fu": [3, 7], "wenyin": [3, 7], "brian": [3, 7, 8], "fuller": [3, 7, 8], "cynthia": [3, 7], "gao": [3, 5, 7, 8], "vedanuj": [3, 7], "goswami": [3, 7, 8], "naman": [3, 6, 7], "goyal": [3, 6, 7], "anthoni": [3, 6, 7], "hartshorn": [3, 7], "saghar": [3, 7], "hosseini": [3, 7], "hakan": [3, 7], "inan": [3, 7], "marcin": [3, 7], "karda": [3, 7], "viktor": [3, 7], "kerkez": [3, 7], "madian": [3, 7], "khabsa": [3, 7], "isabel": [3, 7, 8], "kloumann": [3, 7], "artem": [3, 7], "korenev": [3, 7], "punit": [3, 7], "singh": [3, 5, 6, 7], "koura": [3, 7], "mari": [3, 5, 7, 8], "ann": [3, 7, 8], "lachaux": [3, 7], "thibaut": [3, 7], "lavril": [3, 7], "jenya": [3, 7], "diana": [3, 5, 7], "liskovich": [3, 7], "yinghai": [3, 7], "yune": [3, 7], "mao": [3, 4, 7], "xavier": [3, 7], "martinet": [3, 7], "todor": [3, 7, 8], "mihaylov": [3, 7], "pushkar": [3, 7], "mishra": [3, 5, 7], "igor": [3, 5, 7, 8], "molybog": [3, 7], "yixin": [3, 5, 7], "nie": [3, 5, 6, 7], "andrew": [3, 5, 7, 8], "poulton": [3, 7], "reizenstein": [3, 7], "rashi": [3, 7], "rungta": [3, 7], "kalyan": [3, 7], "saladi": [3, 7], "alan": [3, 7, 8], "schelten": [3, 7], "ruan": [3, 7], "silva": [3, 7], "ranjan": [3, 7], "subramanian": [3, 7], "xiaoq": [3, 7], "ellen": [3, 7], "tan": [3, 5, 6, 7], "binh": [3, 7], "ross": [3, 4, 7, 8], "taylor": [3, 7], "adina": [3, 7, 8], "jian": [3, 5, 6, 7], "kuan": [3, 7], "puxin": [3, 7], "yan": [3, 4, 5, 7], "iliyan": [3, 7], "zarov": [3, 7], "yuchen": [3, 5, 7, 8], "angela": [3, 5, 7, 8], "fan": [3, 5, 7], "melani": [3, 7], "kambadur": [3, 7], "sharan": [3, 7], "narang": [3, 7], "aurelien": [3, 7], "rodriguez": [3, 7], "stojnic": [3, 7], "sergei": [3, 7], "edunov": [3, 7], "thoma": [3, 5, 7, 8], "scialom": [3, 7], "2307": [3, 7, 9], "09288": [3, 7], "vaa": [3, 8], "berti": [3, 8], "adarsh": [3, 8], "agraw": [3, 8], "ahm": [3, 8], "victor": [3, 8], "akinwand": [3, 8], "namir": [3, 8], "nuaimi": [3, 8], "najla": [3, 8], "alfaraj": [3, 8], "alhajjar": [3, 8], "aroyo": [3, 8], "trupti": [3, 8], "bavalatti": [3, 8], "max": [3, 5, 6, 8], "bartolo": [3, 8], "borhan": [3, 8], "blili": [3, 8], "hamelin": [3, 8], "kurt": [3, 8], "bollack": [3, 8], "rishi": [3, 5, 7, 8], "bomassani": [3, 8], "marisa": [3, 8], "ferrara": [3, 8], "boston": [3, 8], "sim\u00e9on": [3, 8], "campo": [3, 8], "kal": [3, 8], "chakra": [3, 8], "canyu": [3, 8], "codi": [3, 8], "coleman": [3, 8], "zachari": [3, 5, 8], "delpierr": [3, 8], "coudert": [3, 8], "leon": [3, 8], "derczynski": [3, 8], "debojyoti": [3, 8], "dutta": [3, 8], "ian": [3, 5, 8], "eisenberg": [3, 8], "ezick": [3, 8], "heather": [3, 8], "frase": [3, 8], "ram": [3, 7, 8], "gandikota": [3, 8], "agasthya": [3, 8], "gangavarapu": [3, 8], "ananya": [3, 5, 8], "geali": [3, 8], "rajat": [3, 8], "ghosh": [3, 5, 8], "goel": [3, 5, 8], "usman": [3, 8], "gohar": [3, 8], "sujata": [3, 8], "hale": [3, 8], "wiebk": [3, 8], "hutiri": [3, 8], "marvin": [3, 8], "imperi": [3, 8], "surgan": [3, 8], "jandial": [3, 8], "nick": [3, 5, 8], "judd": [3, 8], "felix": [3, 5, 8], "juefei": [3, 8], "fouts": [3, 8], "khomh": [3, 8], "bhavya": [3, 8], "kailkhura": [3, 8], "hannah": [3, 5, 8], "rose": [3, 8], "kirk": [3, 8], "klyman": [3, 8], "knotz": [3, 8], "kuchnik": [3, 8], "shachi": [3, 8], "kumar": [3, 5, 8], "srijan": [3, 8], "lengerich": [3, 8], "bo": [3, 5, 7, 8], "zeyi": [3, 8], "liao": [3, 5, 8], "eileen": [3, 8], "sarah": [3, 5, 8], "luger": [3, 8], "yifan": [3, 5, 8], "priyanka": [3, 8], "mammen": [3, 8], "kelvin": [3, 6, 8], "manyeki": [3, 8], "mcgregor": [3, 8], "virendra": [3, 8], "mehta": [3, 5, 8], "shafe": [3, 8], "moham": [3, 8], "moss": [3, 8], "lama": [3, 8], "nachman": [3, 8], "dinesh": [3, 8], "jinenh": [3, 8], "naganna": [3, 8], "amin": [3, 8], "nikanjam": [3, 8], "besmira": [3, 8], "nushi": [3, 8], "lui": [3, 5, 8], "oala": [3, 8], "iftach": [3, 8], "orr": [3, 5, 8], "alicia": [3, 5, 8], "parrish": [3, 5, 8], "cigdem": [3, 8], "patlak": [3, 8], "pietri": [3, 8], "forough": [3, 8], "poursabzi": [3, 8], "sangdeh": [3, 8], "eleonora": [3, 8], "presani": [3, 8], "fabrizio": [3, 8], "puletti": [3, 8], "r\u00f6ttger": [3, 8], "sahai": [3, 8], "santo": [3, 8], "nino": [3, 8], "scherrer": [3, 8], "alic": [3, 5, 8, 9], "schoenauer": [3, 8], "sebag": [3, 8], "patrick": [3, 6, 8], "schramowski": [3, 8], "abolfazl": [3, 8], "shahbazi": [3, 8], "vin": [3, 8], "xudong": [3, 5, 6, 8], "vamsi": [3, 8], "sistla": [3, 8], "leonard": [3, 8], "testuggin": [3, 8], "vithursan": [3, 8], "thangarasa": [3, 8], "elizabeth": [3, 5, 8], "watkin": [3, 8], "rebecca": [3, 5, 8], "weiss": [3, 8], "welti": [3, 8], "tyler": [3, 5, 8], "wilber": [3, 8], "jean": [3, 8], "poonam": [3, 8], "yadav": [3, 8], "xianjun": [3, 8], "yang": [3, 5, 6, 7, 8, 9], "yi": [3, 5, 6, 8, 9], "wenhui": [3, 8], "fedor": [3, 8], "zhdanov": [3, 8], "jiacheng": [3, 5, 8], "perci": [3, 5, 8], "liang": [3, 5, 8, 9], "mattson": [3, 8], "joaquin": [3, 8], "vanschoren": [3, 8], "v0": [3, 8, 9], "12241": [3, 8], "wyg": 3, "tianhao": [3, 5, 7, 8], "weizh": 3, "yuan": [3, 5, 8], "olga": 3, "golovneva": 3, "jing": [3, 8], "yuandong": 3, "tian": 3, "jiantao": 3, "jiao": 3, "jason": [3, 5, 8], "weston": 3, "sainbayar": 3, "sukhbaatar": 3, "19594": 3, "xfg": 3, "shusheng": 3, "jiaxuan": 3, "wenji": 3, "ye": [3, 5, 7, 8, 9], "weilin": 3, "zhiyu": [3, 9], "mei": [3, 5, 7], "guangju": 3, "chao": 3, "10719": 3, "ywx": 3, "yueqin": 3, "zhendong": 3, "yujia": [3, 6], "xie": [3, 5], "mingyuan": 3, "paradigm": [3, 5], "semanticscholar": 3, "corpusid": 3, "270199610": 3, "suppos": [4, 5, 9], "econom": [4, 5, 6], "fuel": 4, "equival": [4, 5, 7], "consumpt": [4, 5], "contrari": 4, "truth": [4, 5, 7, 8, 9], "stanlei": 4, "jevon": 4, "a16z": 4, "andreessen": 4, "horowitz": 4, "10x": 4, "outpac": 4, "moor": 4, "pc": 4, "edholm": 4, "bandwidth": 4, "era": 4, "llmflation": 4, "mmlu": [4, 7, 8], "60": [4, 5, 6, 7, 8], "06": [4, 5, 6, 9], "price": [4, 5, 6, 7], "fallen": 4, "62": [4, 5, 7], "introduct": 4, "march": [4, 5, 9], "stem": [4, 5, 9], "compound": 4, "bit": [4, 6, 7], "tune": [4, 5, 8], "dpo": [4, 7], "competit": [4, 5, 6, 7, 8], "plummet": 4, "rapidli": [4, 7, 8, 9], "preciou": 4, "wouldn": [4, 5], "sens": [4, 8], "wait": [4, 5, 8], "wave": 4, "economist": 4, "1865": 4, "studi": [4, 9], "coal": 4, "industri": [4, 5, 6, 7, 8, 9], "made": [4, 5, 6, 7, 9], "counterintuit": 4, "discoveri": 4, "steam": 4, "spend": [4, 5, 6], "repeat": [4, 6], "didn": [4, 9], "smartphon": [4, 5, 6, 7], "server": [4, 5, 7, 9], "network": [4, 5, 6, 7, 9], "transmiss": 4, "got": 4, "cheaper": [4, 5], "shift": [4, 5, 6], "hd": 4, "stream": [4, 5, 6, 7, 9], "storag": [4, 5, 7, 8], "gigabyt": 4, "massiv": [4, 5, 8], "broadli": [4, 6, 7, 9], "audio": [4, 5, 6], "transcript": [4, 6], "multimod": [4, 7, 8], "imag": [4, 5, 6, 7, 8], "exponenti": [4, 5], "growth": [4, 5, 6], "magnifi": 4, "everyth": [4, 9], "billion": [4, 5, 6, 7], "dollar": [4, 5, 7], "annual": [4, 5, 6, 8], "millisecond": [4, 5], "latenc": [4, 5, 7, 8], "30": [4, 5, 6, 7, 8], "mobil": [4, 5, 7, 9], "tradeoff": [4, 7, 8, 9], "pro": [4, 5, 6, 7, 8], "trigger": [4, 6, 8], "premium": [4, 5], "innov": [4, 5, 6, 7, 8], "capac": [4, 5, 6, 7], "link": [4, 5], "dual": 4, "character": [4, 5, 8], "ahead": [4, 7, 8], "decai": [4, 7], "discuss": [4, 5, 7, 8], "area": [4, 5, 8, 9], "flash": [4, 6, 7], "cach": [4, 5, 6, 7], "compress": [4, 5, 7], "provis": [4, 5], "extent": [4, 5, 8], "problema": 4, "accomplish": [4, 8, 9], "accompani": [4, 5, 8], "transact": [4, 5, 8], "roi": 4, "alloc": [4, 5, 6, 7, 8], "budget": [4, 7], "viabil": [4, 7], "prioriti": [4, 5, 7], "overlook": [4, 6], "thorough": [4, 7, 8], "identif": [4, 5], "specifi": [4, 5, 6, 7, 8, 9], "longev": 4, "accommod": 4, "evalu": [4, 6, 7, 9], "multi": [4, 5, 6, 7, 8, 9], "baselin": [4, 5, 7, 8], "met": [4, 5, 8], "equal": [4, 5, 6, 8], "concurr": [4, 7], "peak": 4, "spike": 4, "versu": [4, 5, 7, 8], "volum": [4, 5, 7, 8], "season": [4, 5], "variat": [4, 5, 7], "uptim": 4, "mainten": [4, 5, 7, 8], "disrupt": [4, 5, 6], "backup": 4, "failov": 4, "clearli": [4, 5, 8, 9], "redund": [4, 5], "recoveri": [4, 5], "unexpect": [4, 5, 8, 9], "event": [4, 5], "seamless": [4, 5, 8], "broader": [4, 5, 6, 7, 8], "vector": [4, 7, 8], "retriev": [4, 5, 7], "augment": [4, 5, 7], "rag": [4, 7], "retent": [4, 5, 6], "polici": [4, 5, 6, 7], "essenti": [4, 5, 6, 7, 8, 9], "opportun": [4, 5, 6], "post": [4, 5, 7, 8], "32": [4, 5, 6, 7], "fp32": 4, "fp16": [4, 7], "proport": [4, 5, 7], "byte": 4, "120": [4, 5, 8], "gb": 4, "whole": [4, 5], "done": [4, 5, 7, 8, 9], "smollm2": [4, 5, 7, 9], "135m": [4, 7], "load_gguf": 4, "bartowski": 4, "gguf": [4, 7], "gguf_file_q2_k": 4, "q2_k": [4, 7], "gguf_file_f16": 4, "f16": 4, "model_q2_k": 4, "gguf_fil": 4, "model_f16": 4, "mlp": 4, "layer": [4, 5, 7, 9], "proxi": [4, 5, 6, 8], "mlp_weights_q2_k": 4, "gate_proj": 4, "mlp_weights_f16": 4, "tensor": [4, 9], "0145": 4, "1826": 4, "1377": 4, "1719": 4, "1387": 4, "0298": 4, "1631": 4, "0781": 4, "2051": [4, 5], "2070": 4, "0334": 4, "2891": 4, "1768": 4, "0488": 4, "2393": 4, "0396": 4, "1348": 4, "1533": 4, "0771": 4, "0845": 4, "0232": 4, "0178": 4, "1040": 4, "1582": 4, "1167": 4, "0474": 4, "0359": 4, "2500": 4, "0432": 4, "0972": 4, "0933": 4, "2188": 4, "0776": 4, "0674": 4, "requires_grad": 4, "0028": 4, "1852": 4, "1396": 4, "1506": 4, "1635": 4, "0043": 4, "0680": 4, "2257": 4, "1890": 4, "0464": 4, "2960": 4, "1840": 4, "0451": 4, "2395": 4, "0413": 4, "1446": 4, "0621": 4, "0478": 4, "0038": 4, "0830": 4, "1473": 4, "0926": 4, "0547": 4, "0824": 4, "0429": 4, "2737": 4, "0355": 4, "0782": 4, "2043": [4, 5], "0740": 4, "arriv": [4, 5], "pearson": 4, "numpi": [4, 5], "np": [4, 5], "arrai": [4, 8], "detach": 4, "graph": [4, 5], "weights_f16": 4, "weights_q2_k": 4, "flat_f16": 4, "flatten": 4, "flat_q2_k": 4, "corrcoef": 4, "4f": [4, 9], "9970": 4, "exemplifi": [4, 6, 7, 8], "70b": [4, 5, 7], "unsloth": 4, "141": 4, "q8_0": [4, 7], "75": [4, 8], "47": [4, 5, 7, 8], "cumul": [4, 5, 6], "26": [4, 5, 7], "19": [4, 5, 6, 7, 8], "space": [4, 5, 7, 8], "counterpart": 4, "spectrum": [4, 5, 6], "variant": [4, 5, 7, 8], "laptop": [4, 5], "desktop": [4, 5, 7], "enterpris": [4, 5, 7, 8, 9], "ceil": 4, "notabl": [4, 5, 8, 9], "bitnet": 4, "cpp": [4, 9], "arm": 4, "x86": 4, "speedup": [4, 7], "37x": 4, "07x": 4, "17x": 4, "beyond": [4, 5, 8], "raw": [4, 5, 7, 8, 9], "speed": [4, 5, 7, 8], "energi": [4, 5, 6], "55": [4, 5, 6, 7], "70": [4, 5, 7], "71": [4, 5], "82": [4, 8], "impress": [4, 7, 9], "100b": 4, "b1": 4, "58": [4, 6, 7], "pace": [4, 5, 6, 8], "second": [4, 5, 6, 7, 8], "kernel": 4, "characterist": [4, 5, 7, 8, 9], "excit": [4, 7], "frontier": [4, 8], "compel": [4, 5, 7, 9], "acceler": [4, 5, 7, 8], "faster": [4, 7], "arithmet": [4, 5], "benefici": [4, 5, 7], "sustain": [4, 5, 6, 7, 8], "Be": [4, 5, 7, 8], "fine": [4, 5, 8], "pure": [4, 5, 7], "unlock": [4, 9], "track": [4, 5, 6, 8], "chargeback": 4, "regularli": [4, 5], "wz": 4, "jinheng": 4, "hansong": 4, "ting": [4, 8], "shaoguang": 4, "shume": 4, "ma": [4, 5, 8], "hongyu": [4, 5], "xia": [4, 5, 7], "infra": 4, "fast": [4, 5, 7, 8, 9], "lossless": 4, "16144": 4, "andreessenhorowitz24": 4, "huggingface4w": [4, 7], "2024w": [4, 7], "unsloth24": 4, "jonathan": [4, 5, 8], "ceo": [4, 5], "groq": [4, 7], "streamlin": [4, 6, 7, 9], "notat": 4, "width": [4, 7], "_k": 4, "_0": 4, "matter": [5, 6], "beauti": 5, "smart": [5, 8], "agre": 5, "wrong": 5, "feynman": 5, "advent": 5, "norm": 5, "realm": 5, "convent": [5, 8], "evolut": [5, 7], "conceiv": 5, "entrench": 5, "seem": 5, "daunt": [5, 6], "ignor": 5, "outdat": [5, 6, 8, 9], "inevit": 5, "setback": 5, "imper": 5, "embrac": 5, "proactiv": [5, 8], "mindset": 5, "front": [5, 7], "produc": [5, 6, 7, 8, 9], "novel": [5, 7], "ident": [5, 6], "isn": [5, 8], "bug": 5, "random": [5, 8, 9], "testabl": 5, "exceedingli": 5, "guarante": [5, 6, 7, 8, 9], "primari": [5, 8], "nucleu": 5, "2020": 5, "summari": [5, 6, 7, 8, 9], "alter": 5, "rigid": 5, "wildli": 5, "incoher": 5, "inadequ": [5, 8], "temp": 5, "df_result": 5, "ntemperatur": 5, "40": [5, 6, 7], "temp_respons": 5, "iterrow": [5, 8], "10000": [5, 6, 9], "appl": [5, 6, 9], "txt": [5, 6, 7, 9], "sec_fil": [5, 9], "nsecur": 5, "AND": [5, 9], "exchang": [5, 6, 8, 9], "commiss": [5, 6, 8, 9], "nwashington": 5, "20549": 5, "nform": 5, "pursuant": 5, "TO": [5, 8], "13": [5, 6, 7, 8], "OR": 5, "OF": [5, 8], "THE": [5, 8], "1934": 5, "nfor": 5, "fiscal": [5, 6], "septemb": [5, 6], "28": [5, 6, 7, 8], "nor": 5, "period": [5, 6, 8], "ncommiss": 5, "001": [5, 7], "36743": 5, "ng66145g66i43": 5, "jpg": 5, "nappl": 5, "exact": [5, 7, 8], "registr": 5, "charter": 5, "ncalifornia": 5, "t94": 5, "2404110": 5, "jurisdict": 5, "nof": 5, "incorpor": [5, 6, 7, 8, 9], "employ": 5, "park": 5, "ncupertino": 5, "california": [5, 8, 9], "n95014": 5, "princip": 5, "offic": [5, 6, 8], "408": 5, "996": 5, "1010": 5, "telephon": 5, "regist": 5, "ntitl": 5, "ttrade": 5, "symbol": 5, "tname": 5, "ncommon": 5, "stock": [5, 9], "00001": 5, "naapl": 5, "tthe": 5, "nasdaq": [5, 6, 9], "llc": [5, 9], "n0": 5, "000": [5, 7, 9], "note": [5, 6, 7, 8, 9], "2025": [5, 6], "875": 5, "625": 5, "2026": 5, "2027": 5, "375": 5, "2029": 5, "050": 5, "2031": [5, 8], "600": 5, "2042": 5, "nindic": 5, "issuer": 5, "405": 5, "nye": 5, "preced": [5, 9], "shorter": 5, "past": [5, 6, 8], "90": [5, 6, 7, 8], "submit": [5, 7, 8], "electron": 5, "232": 5, "filer": 5, "12b": [5, 8], "nlarg": 5, "tacceler": 5, "nnon": 5, "tsmaller": 5, "nemerg": 5, "nif": 5, "elect": [5, 8], "revis": [5, 8], "attest": 5, "404": 5, "sarban": 5, "oxlei": 5, "7262": 5, "firm": [5, 8], "prepar": [5, 6, 7, 8], "correct": [5, 6, 8], "restat": 5, "incent": 5, "compens": 5, "240": 5, "10d": 5, "shell": 5, "aggreg": [5, 8], "vote": 5, "held": [5, 9], "affili": [5, 9], "29": [5, 7, 8, 9], "last": [5, 6, 8, 9], "quarter": 5, "628": [5, 9], "553": [5, 9], "sole": [5, 8], "disclosur": [5, 6, 7, 8], "director": [5, 7, 8], "date": [5, 6], "exclud": 5, "n15": 5, "115": [5, 9], "823": [5, 9], "outstand": [5, 9], "octob": [5, 9], "18": [5, 6, 7, 8, 9], "ndocument": 5, "BY": 5, "nportion": 5, "meet": [5, 6, 8, 9], "sharehold": [5, 6], "iii": 5, "ntabl": 5, "npage": 5, "npart": 5, "nitem": 5, "nbusi": 5, "1a": 5, "nrisk": 5, "1b": [5, 7, 8], "nunresolv": 5, "staff": 5, "comment": 5, "n17": 5, "1c": 5, "ncybersecur": 5, "nproperti": 5, "n18": 5, "nlegal": 5, "proceed": [5, 6, 8], "nmine": 5, "ii": [5, 7, 9], "nmarket": 5, "stockhold": 5, "purchas": [5, 6, 8], "n19": 5, "reserv": [5, 6], "n20": 5, "nmanag": 5, "n21": 5, "7a": 5, "nquantit": 5, "n27": 5, "nfinanci": 5, "supplementari": 5, "n28": 5, "nchang": 5, "disagr": 5, "n51": 5, "9a": 5, "ncontrol": 5, "procedur": [5, 6, 8], "9b": 5, "nother": 5, "n52": 5, "9c": 5, "ndisclosur": 5, "foreign": [5, 6], "ndirector": 5, "corpor": [5, 6, 8], "nexecut": 5, "ownership": [5, 7], "certain": [5, 6, 8, 9], "owner": 5, "ncertain": 5, "nprincip": 5, "fee": [5, 6], "iv": 5, "nexhibit": 5, "n53": 5, "n56": 5, "nthi": 5, "litig": [5, 6, 7], "reform": 5, "1995": 5, "uncertainti": [5, 6, 7, 8], "macroeconom": [5, 6], "anticip": [5, 6, 8], "caus": [5, 8], "oblig": [5, 6], "nunless": 5, "herein": 5, "calendar": 5, "wholli": 5, "subsidiari": 5, "unless": [5, 7], "ncompani": 5, "manufactur": 5, "tablet": [5, 6, 7], "wearabl": 5, "accessori": 5, "sell": [5, 8], "varieti": [5, 7], "52": [5, 8], "53": [5, 6, 8], "week": 5, "saturdai": 5, "nproduct": 5, "niphon": 5, "io": [5, 6, 9], "iphon": [5, 6], "se": [5, 8], "nmac": 5, "maco": [5, 7], "mac": [5, 7], "macbook": 5, "air": 5, "imac": 5, "studio": 5, "nipad": 5, "multipurpos": 5, "ipado": 5, "ipad": 5, "nwearabl": 5, "home": [5, 6, 9], "smartwatch": 5, "wireless": 5, "headphon": 5, "spatial": 5, "watcho": 5, "watch": 5, "ultra": 5, "airpod": 5, "beat": [5, 7], "visiono": 5, "nhome": 5, "tv": 5, "tvo": 5, "homepod": 5, "fidel": [5, 9], "naccessori": 5, "brand": 5, "third": [5, 6, 7, 8], "parti": [5, 6, 7, 8], "nservic": 5, "nadvertis": 5, "advertis": 5, "licens": [5, 6], "napplecar": 5, "portfolio": [5, 6], "applecar": 5, "repair": 5, "coverag": [5, 6, 8], "accident": 5, "damag": [5, 8], "theft": [5, 8], "ncloud": 5, "ndigit": 5, "app": [5, 6, 7], "discov": [5, 7, 8], "download": [5, 7], "music": 5, "podcast": 5, "subscript": [5, 7], "arcad": 5, "sm": 5, "listen": [5, 7], "radio": 5, "station": 5, "magazin": 5, "exclus": 5, "sport": 5, "npayment": 5, "payment": 5, "credit": [5, 6], "pai": [5, 7], "cashless": 5, "nsegment": 5, "primarili": [5, 6, 8], "geograph": [5, 6, 8], "basi": [5, 7], "segment": [5, 6, 9], "america": [5, 6], "europ": 5, "china": [5, 6, 7, 8], "japan": 5, "rest": [5, 7], "asia": 5, "pacif": 5, "north": [5, 8], "south": 5, "european": [5, 8], "india": 5, "middl": [5, 7, 8], "east": 5, "africa": 5, "mainland": 5, "kong": 5, "taiwan": 5, "australia": 5, "asian": [5, 6], "although": [5, 7], "partner": [5, 6, 7, 8], "mid": [5, 6], "resel": [5, 6], "retail": 5, "sale": [5, 6], "indirect": 5, "channel": [5, 8], "cellular": 5, "carrier": 5, "net": [5, 6, 9], "38": [5, 6, 7, 8], "ncompetit": 5, "downward": 5, "pressur": [5, 8], "gross": [5, 8], "cycl": [5, 8], "competitor": [5, 6, 7, 8], "compet": [5, 6, 7], "imit": 5, "infring": [5, 7], "intellectu": [5, 7, 8], "marketplac": [5, 8], "nearli": [5, 7], "reput": [5, 8], "expand": [5, 6, 7, 8], "illegitim": [5, 8], "collabor": [5, 7, 8], "nsuppli": 5, "nalthough": 5, "particip": 5, "shortag": 5, "commod": [5, 6, 7], "fluctuat": [5, 6], "commonli": 5, "until": [5, 8, 9], "supplier": 5, "matur": 5, "concentr": [5, 6], "enter": [5, 9], "agreement": [5, 6], "suppli": [5, 6, 9], "renew": [5, 6], "nresearch": 5, "nbecaus": 5, "upon": [5, 6, 8], "flow": [5, 6, 9], "acquisit": [5, 6, 8], "nintellectu": 5, "broad": [5, 6, 7, 9], "patent": 5, "copyright": [5, 7], "trademark": 5, "secret": 5, "differenti": 5, "skill": [5, 8], "personnel": 5, "pursu": [5, 8], "thousand": [5, 7], "durat": 5, "adequ": [5, 8], "nin": 5, "holidai": [5, 8], "fill": 5, "inventori": 5, "older": [5, 7], "newer": 5, "distributor": 5, "nhuman": 5, "strive": 5, "retain": [5, 6, 7, 8], "talent": [5, 6], "member": [5, 8], "164": 5, "ncompens": 5, "equit": 5, "succe": 5, "health": [5, 6, 8], "awai": [5, 6, 8], "ngrowth": 5, "career": 5, "leadership": [5, 8], "nworkplac": 5, "workplac": 5, "ninclus": 5, "workforc": 5, "nengag": 5, "among": [5, 7, 8, 9], "gaug": 5, "sentiment": [5, 6, 7, 9], "nhealth": 5, "everywher": 5, "crisi": 5, "visitor": 5, "navail": 5, "quarterli": 5, "q": [5, 6, 7, 8], "amend": 5, "sec": [5, 6, 9], "Such": [5, 8], "charg": 5, "investor": [5, 6, 9], "aspx": 5, "websit": [5, 7, 8], "environment": [5, 8], "referenc": [5, 6], "inact": 5, "textual": 5, "unknown": [5, 6, 8], "advers": 5, "conjunct": 5, "consolid": [5, 6], "nmacroeconom": 5, "facil": 5, "assembli": 5, "site": [5, 9], "nadvers": 5, "slow": 5, "recess": 5, "unemploy": [5, 6], "inflat": [5, 6], "tighter": 5, "currenc": [5, 6], "monetari": 5, "contract": [5, 7], "logist": 5, "instabl": [5, 8], "inabl": [5, 6], "financ": [5, 7, 8], "insolv": 5, "counterparti": 5, "debt": 5, "liquid": [5, 6], "fair": [5, 8], "instrument": 5, "polit": [5, 8], "disput": 5, "geopolit": 5, "tension": [5, 8], "terror": 5, "accid": 5, "interrupt": 5, "npolit": 5, "outsourc": [5, 6], "korea": 5, "vietnam": 5, "restrict": [5, 7, 8, 9], "tariff": 5, "export": [5, 6], "portion": [5, 7], "revenu": [5, 6, 9], "restructur": 5, "ceas": 5, "escal": [5, 8], "nmani": 5, "prone": [5, 8], "earthquak": 5, "climat": 5, "weather": 5, "plant": 5, "terrorist": [5, 8], "attack": [5, 8], "hostil": 5, "ransomwar": 5, "cybersecur": [5, 6, 8], "labor": 5, "nsuch": 5, "imposs": [5, 7], "slowdown": 5, "outag": 5, "neg": [5, 6, 8, 9], "pandem": 5, "covid": 5, "economi": 5, "imposit": 5, "stringent": [5, 7, 8], "travel": 5, "freight": 5, "movement": 5, "ramp": 5, "nfollow": 5, "expenditur": 5, "resum": 5, "exacerb": 5, "insur": 5, "nglobal": 5, "unabl": 5, "assur": [5, 8], "minor": [5, 6, 8], "naddition": 5, "intensifi": 5, "seamlessli": 5, "nto": 5, "stimul": 5, "ndue": 5, "upgrad": 5, "quantiti": 5, "defect": 5, "defici": 5, "supersed": 5, "nsubstanti": 5, "transport": 5, "reimburs": 5, "warranti": 5, "unanticip": 5, "liabil": 5, "final": [5, 6, 8, 9], "finish": [5, 8], "destin": 5, "prepay": 5, "termin": [5, 7], "recover": 5, "exposur": [5, 8], "nfutur": 5, "semiconductor": 5, "suffer": [5, 8], "constrain": [5, 7, 9], "shipment": 5, "unexpectedli": 5, "interfer": 5, "unsaf": [5, 8], "expos": [5, 6, 8], "widespread": [5, 8], "vulner": [5, 6, 8], "compromis": [5, 7, 8], "claim": [5, 6, 7, 8], "intang": 5, "lost": [5, 6, 8], "cancel": 5, "obsolet": 5, "exce": [5, 8], "realiz": 5, "accru": 5, "excess": 5, "impair": 5, "whenev": 5, "circumst": 5, "amount": [5, 6, 8, 9], "carri": [5, 7, 9], "incur": 5, "unpredict": [5, 8], "obsolesc": 5, "forecast": [5, 6, 8], "incorrectli": [5, 8, 9], "extens": [5, 7, 9], "issuanc": 5, "unknowingli": [5, 8], "notifi": 5, "preclud": 5, "bui": 5, "percept": 5, "android": [5, 6], "playstat": 5, "nintendo": 5, "xbox": 5, "inclin": 5, "devot": 5, "dissatisfi": 5, "vast": [5, 8], "storefront": 5, "safari": 5, "union": [5, 8], "eu": [5, 6, 8], "dma": [5, 6], "narrow": [5, 7, 8], "scope": [5, 6, 7, 8], "elimin": [5, 7], "nfailur": 5, "appeal": [5, 6], "subscrib": [5, 6], "nsome": 5, "manner": [5, 6, 8], "nurtur": 5, "nmuch": 5, "chief": [5, 6], "silicon": 5, "vallei": 5, "constantli": 5, "driver": [5, 7], "recruit": 5, "subsidi": 5, "staf": 5, "contractor": 5, "placement": 5, "increment": 5, "weaken": 5, "telecommun": 5, "war": 5, "virus": 5, "ins": 5, "incid": [5, 8], "ineffect": 5, "thing": [5, 9], "interf": 5, "imped": 5, "ship": 5, "nloss": 5, "unauthor": [5, 8], "confidenti": [5, 7], "encrypt": 5, "But": [5, 8, 9], "behalf": 5, "normal": [5, 6, 8, 9], "investig": [5, 8], "penalti": [5, 7], "frequenc": [5, 7, 8], "actor": [5, 8], "circumv": [5, 8], "obfusc": 5, "forens": 5, "hinder": [5, 9], "recov": 5, "perpetr": 5, "profil": [5, 7], "authent": 5, "hack": [5, 8], "malfeas": 5, "faulti": 5, "password": 5, "irregular": 5, "fraudul": 5, "induc": 5, "disclos": [5, 6, 9], "usernam": 5, "turn": [5, 6, 8], "multifactor": 5, "unusu": 5, "freez": 5, "suspici": 5, "nwhile": 5, "ninvest": 5, "ongo": [5, 6, 7], "contempl": 5, "endeavor": 5, "distract": 5, "tangibl": 5, "approv": 5, "oner": 5, "ventur": 5, "riski": 5, "leas": 5, "unfavor": [5, 6], "arisen": 5, "ordinari": 5, "cours": [5, 7, 8], "resolv": [5, 7, 8], "sometim": [5, 9], "indemnif": 5, "indemnifi": 5, "alleg": 5, "magnitud": 5, "assert": [5, 6], "royalti": 5, "vigor": 5, "defend": 5, "court": [5, 7], "internation": 5, "plaintiff": 5, "injunct": 5, "relief": 5, "nregardless": 5, "merit": 5, "recognit": [5, 7, 8], "settl": 5, "uncertain": [5, 6], "disgorg": 5, "remedi": [5, 8], "worldwid": 5, "antitrust": [5, 6], "bill": [5, 6], "commerc": 5, "televis": 5, "film": 5, "anticorrupt": 5, "cash": [5, 6], "repatri": 5, "launder": 5, "tax": [5, 6], "wast": 5, "recycl": 5, "ncomplianc": 5, "impos": [5, 7, 8, 9], "agent": [5, 7, 8], "nregulatori": 5, "ban": [5, 8], "nexpect": 5, "increasingli": [5, 7, 8, 9], "greenhous": 5, "ga": 5, "emiss": 5, "civil": 5, "disagre": 5, "perceiv": 5, "feder": 5, "nfrom": 5, "noncompli": 5, "individu": [5, 6, 7, 8], "lawsuit": [5, 7], "monopol": 5, "nfurther": 5, "earn": 5, "search": [5, 6, 7, 8], "nthere": 5, "transfer": 5, "pass": [5, 6, 7, 8, 9], "pend": 5, "inquiri": [5, 8], "government": 5, "entiti": [5, 7, 8, 9], "biometr": 5, "notif": 5, "permit": [5, 7, 9], "healthcar": [5, 6, 7], "liabl": 5, "investigatori": 5, "cardhold": 5, "acquir": 5, "denomin": 5, "offset": 5, "strengthen": [5, 8], "nconvers": 5, "thu": 5, "hedg": 5, "deterior": 5, "sovereign": 5, "heighten": [5, 8], "worsen": 5, "A": [5, 6, 7, 8, 9], "collater": 5, "bank": 5, "unsecur": 5, "subassembli": 5, "assembl": 5, "legisl": 5, "ireland": [5, 8], "singapor": 5, "organis": 5, "statutori": 5, "valuat": [5, 6], "defer": 5, "bodi": [5, 8], "adequaci": 5, "ow": 5, "ngener": 5, "repurchas": 5, "dividend": 5, "consumm": 5, "declar": [5, 6], "board": [5, 6, 8], "unresolv": 5, "nnone": 5, "threat": [5, 6, 8], "postur": 5, "25": [5, 6, 7, 8], "2016": 5, "coordin": [5, 8], "committe": [5, 8], "oversight": [5, 8], "counsel": 5, "chair": 5, "headquart": 5, "cupertino": [5, 9], "center": [5, 8, 9], "formal": [5, 8, 9], "conclud": [5, 7], "uninstal": 5, "web": [5, 6, 7, 8], "browser": 5, "june": 5, "contractu": 5, "desist": 5, "stai": [5, 7], "grant": 5, "ndepart": 5, "justic": 5, "depart": [5, 8], "doj": 5, "district": 5, "attornei": 5, "jersei": 5, "redress": [5, 8], "anticompetit": 5, "nonmonetari": 5, "defens": [5, 8], "nepic": 5, "epic": 5, "northern": 5, "unfair": [5, 8], "enjoin": 5, "extern": [5, 6, 8], "januari": 5, "motion": 5, "oppos": [5, 8], "vacat": 5, "fourth": 5, "mine": 5, "nnot": 5, "aapl": 5, "nholder": 5, "na": [5, 8], "301": 5, "npurchas": 5, "nshare": 5, "nperiod": 5, "ttotal": 5, "taverag": 5, "npaid": 5, "nannounc": 5, "napproxim": 5, "That": [5, 6, 8, 9], "nunder": 5, "njune": 5, "august": [5, 6, 8], "nopen": 5, "negoti": [5, 8], "t35": 5, "697": 5, "t224": 5, "naugust": 5, "31": [5, 6, 7], "t42": 5, "910": 5, "t221": 5, "39": [5, 6, 7], "nseptemb": 5, "t33": 5, "653": 5, "t222": 5, "86": [5, 7], "ntotal": [5, 8], "t112": 5, "260": 5, "t89": 5, "074": 5, "110": 5, "10b5": 5, "reinvest": 5, "dow": 5, "supersector": 5, "27": [5, 7, 8], "2019": 5, "n2218": 5, "tseptemb": 5, "t100": 5, "t207": 5, "t273": 5, "t281": 5, "t322": 5, "t430": 5, "t113": 5, "t156": 5, "t131": 5, "t155": 5, "t210": 5, "ndow": 5, "t146": 5, "t216": 5, "t215": 5, "nfirst": 5, "nsecond": 5, "nthird": 5, "sequoia": 5, "nfourth": 5, "plu": [5, 7], "nfiscal": 5, "six": 5, "realign": 5, "span": [5, 7, 8], "indirectli": 5, "n2024": 5, "tchang": 5, "t2023": 5, "t2022": 5, "namerica": 5, "t167": 5, "045": 5, "t3": 5, "t162": 5, "560": 5, "t169": 5, "658": 5, "neurop": 5, "t101": 5, "328": 5, "t7": 5, "294": 5, "t95": 5, "118": 5, "ngreater": 5, "t66": 5, "952": 5, "t72": 5, "559": 5, "t74": 5, "njapan": 5, "t25": 5, "052": 5, "t24": 5, "257": 5, "977": 5, "nrest": 5, "t30": 5, "t4": 5, "t29": 5, "615": 5, "t1": 5, "t391": 5, "035": 5, "t2": 5, "t383": 5, "285": 5, "t394": 5, "weak": [5, 6, 8], "renminbi": 5, "yen": [5, 9], "t201": 5, "183": 5, "t200": 5, "583": 5, "t205": 5, "489": 5, "984": 5, "357": 5, "t40": 5, "177": [5, 8], "t26": 5, "694": 5, "t28": 5, "300": 5, "292": 5, "t37": 5, "005": 5, "t39": 5, "845": [5, 8], "t41": 5, "241": 5, "n96": 5, "169": 5, "t13": 5, "t85": 5, "t9": 5, "t78": 5, "129": [5, 8], "amort": 5, "bundl": 5, "flat": [5, 6], "ngross": 5, "t109": 5, "633": 5, "t108": 5, "803": 5, "t114": 5, "728": 5, "t71": 5, "t60": 5, "345": 5, "t56": 5, "054": 5, "t180": 5, "683": 5, "148": 5, "t170": 5, "782": 5, "t36": 5, "t73": 5, "t70": 5, "t46": 5, "t44": 5, "t43": 5, "noper": 5, "t31": 5, "370": 5, "t5": 5, "915": 5, "t14": 5, "251": 5, "npercentag": 5, "t8": 5, "nsell": 5, "administr": 5, "097": 5, "932": 5, "094": 5, "t6": 5, "t57": 5, "467": 5, "t54": 5, "847": 5, "t51": 5, "t15": 5, "headcount": 5, "nprovis": 5, "749": 5, "t16": 5, "741": 5, "t19": 5, "neffect": 5, "nstatutori": 5, "t21": 5, "aid": [5, 8], "nliquid": 5, "unrestrict": 5, "140": 5, "ndebt": 5, "97": [5, 6, 8], "payabl": 5, "promissori": 5, "nleas": 5, "nmanufactur": 5, "noncancel": 5, "ndeem": 5, "tcja": 5, "nstate": 5, "fund": [5, 6, 7], "escrow": 5, "ncapit": 5, "95": [5, 8], "nrecent": 5, "pronounc": 5, "nincom": 5, "fasb": 5, "asu": 5, "09": [5, 6, 8], "740": 5, "reconcili": [5, 6], "reconcil": [5, 9], "disaggreg": 5, "prospect": 5, "novemb": [5, 8], "07": [5, 6, 8, 9], "280": 5, "maker": 5, "codm": 5, "retrospect": 5, "ncritic": 5, "conform": [5, 9], "gaap": 5, "nuncertain": 5, "domest": 5, "taxat": 5, "resolut": [5, 6], "conting": 5, "ninterest": 5, "forth": 5, "hypothet": 5, "nsensit": 5, "nhypothet": 5, "nrate": 5, "npotenti": 5, "n100": 5, "tenor": 5, "ndeclin": 5, "755": 5, "089": 5, "nterm": 5, "nincreas": 5, "t139": 5, "t194": 5, "nforeign": 5, "var": 5, "mont": 5, "carlo": 5, "interv": 5, "538": 5, "669": 5, "nindex": 5, "tpage": 5, "nconsolid": 5, "n29": 5, "n30": 5, "sheet": 5, "n31": 5, "n32": 5, "n33": 5, "nnote": 5, "n34": 5, "nreport": 5, "n48": 5, "nall": 5, "omit": [5, 9], "submiss": 5, "nyear": 5, "n2023": 5, "n2022": 5, "nnet": 5, "t294": 5, "866": 5, "t298": 5, "085": 5, "t316": 5, "199": 5, "t96": 5, "ncost": 5, "t185": 5, "233": 5, "t189": 5, "282": 5, "471": 5, "119": 5, "855": 5, "t22": 5, "075": 5, "352": 5, "t214": 5, "137": 5, "t223": 5, "546": 5, "t123": 5, "216": 5, "t119": 5, "437": 5, "t269": 5, "565": 5, "334": 5, "485": 5, "736": 5, "103": 5, "t93": 5, "995": 5, "t99": 5, "nearn": 5, "nbasic": 5, "ndilut": 5, "08": [5, 7, 9], "343": [5, 8], "783": 5, "744": 5, "215": 5, "963": 5, "095": 5, "812": 5, "547": 5, "325": 5, "819": 5, "nsee": 5, "translat": [5, 7, 8], "t395": 5, "765": 5, "511": 5, "unreal": 5, "832": 5, "t323": 5, "212": 5, "nadjust": 5, "337": 5, "717": 5, "394": 5, "138": 5, "850": 5, "563": 5, "104": 5, "t204": 5, "t253": 5, "816": 5, "899": 5, "272": 5, "t98": 5, "016": 5, "652": 5, "t88": 5, "531": 5, "nasset": 5, "ncurrent": 5, "ncash": 5, "943": 5, "965": 5, "228": 5, "590": 5, "naccount": 5, "410": 5, "508": 5, "nvendor": 5, "t32": 5, "833": 5, "477": 5, "ninventori": 5, "286": 5, "331": 5, "287": 5, "695": 5, "t152": 5, "987": 5, "t143": 5, "566": 5, "t91": 5, "479": 5, "544": 5, "t45": 5, "680": 5, "715": 5, "834": 5, "t64": 5, "758": 5, "t211": 5, "993": 5, "t209": 5, "017": 5, "t364": 5, "980": [5, 8], "t352": 5, "nliabil": 5, "t68": 5, "960": 5, "t62": 5, "611": 5, "304": 5, "t58": 5, "829": 5, "ndefer": 5, "249": 5, "061": 5, "ncommerci": 5, "967": 5, "985": 5, "t10": 5, "912": 5, "822": 5, "t176": 5, "392": 5, "t145": 5, "308": 5, "750": 5, "888": 5, "t49": 5, "848": 5, "638": 5, "t308": 5, "030": [5, 7], "t290": 5, "ncommit": 5, "nsharehold": 5, "400": [5, 6], "116": 5, "786": 5, "550": 5, "n83": 5, "276": 5, "naccumul": 5, "deficit": 5, "154": 5, "214": 5, "172": 5, "452": 5, "950": 5, "146": [5, 8], "t50": 5, "672": 5, "t63": 5, "090": 5, "nbegin": 5, "849": 5, "365": 5, "423": 5, "346": [5, 6], "175": 5, "withheld": 5, "settlement": 5, "521": 5, "971": 5, "t12": 5, "034": 5, "t11": 5, "nend": 5, "t83": 5, "nretain": 5, "068": 5, "562": 5, "ndividend": 5, "218": 5, "793": 5, "612": 5, "099": 5, "454": 5, "846": 5, "77": [5, 6, 7], "046": 5, "186": 5, "109": 5, "t163": 5, "rsu": 5, "t0": 5, "98": [5, 6, 7], "94": [5, 6, 7, 8], "737": 5, "929": 5, "ndepreci": 5, "445": 5, "519": 5, "688": 5, "038": 5, "266": 5, "227": 5, "006": 5, "788": 5, "356": 5, "271": 5, "520": 5, "618": 5, "484": 5, "731": 5, "684": 5, "499": 5, "020": 5, "889": 5, "448": 5, "552": 5, "031": 5, "t118": 5, "254": 5, "t110": 5, "543": 5, "t122": 5, "151": 5, "48": [5, 7], "656": 5, "513": 5, "76": [5, 8], "923": 5, "nproce": 5, "211": 5, "686": 5, "917": 5, "135": 5, "828": [5, 6], "446": 5, "447": 5, "959": 5, "708": 5, "086": 5, "935": 5, "705": 5, "354": 5, "nfinanc": 5, "441": 5, "431": 5, "223": [5, 8], "234": [5, 8], "025": 5, "841": 5, "nrepurchas": 5, "949": 5, "89": [5, 8], "402": 5, "465": 5, "nrepay": 5, "958": 5, "repay": 5, "978": 5, "955": 5, "361": 5, "581": 5, "160": 5, "121": 5, "983": 5, "488": 5, "794": 5, "760": 5, "nsupplement": 5, "102": 5, "t18": 5, "679": 5, "573": 5, "33": [5, 6, 7, 8], "nbasi": 5, "prior": [5, 8], "reclassifi": 5, "nrevenu": 5, "remit": [5, 8], "straight": 5, "vest": 5, "sold": 5, "nderiv": 5, "nonleas": 5, "34": [5, 6, 8], "entitl": 5, "commenc": 5, "deliveri": 5, "stand": 5, "ssp": 5, "icloud": 5, "siri": 5, "discount": 5, "undeliv": 5, "unbil": 5, "n26": 5, "n37": 5, "moder": [5, 7], "64": [5, 7, 8], "dilut": 5, "nnumer": 5, "ndenomin": 5, "nweight": 5, "312": 5, "316": 5, "856": 5, "antidilut": 5, "tunreal": 5, "ngain": 5, "tfair": 5, "nvalu": 5, "tcash": 5, "nequival": 5, "tcurrent": 5, "tnon": 5, "t27": 5, "nlevel": 5, "nmonei": 5, "t778": 5, "nmutual": 5, "n515": 5, "t105": 5, "t617": 5, "nsubtot": 5, "293": 5, "395": 5, "nu": 5, "treasuri": 5, "516": 5, "t212": 5, "087": 5, "380": 5, "159": 5, "t703": 5, "t17": 5, "568": 5, "158": 5, "810": 5, "ncertif": 5, "deposit": 5, "t873": 5, "t387": 5, "t478": 5, "066": 5, "ncorpor": 5, "t65": 5, "622": 5, "t270": 5, "953": 5, "939": 5, "027": 5, "t47": 5, "886": 5, "nmunicip": 5, "t412": 5, "t405": 5, "t190": 5, "nmortgag": 5, "595": 5, "t175": 5, "403": 5, "t23": 5, "367": 5, "278": [5, 8], "t132": 5, "t583": 5, "635": 5, "t128": 5, "056": 5, "966": 5, "t34": 5, "t160": 5, "t688": 5, "650": 5, "36": [5, 6, 7, 8], "359": [5, 8], "t481": 5, "n442": 5, "t428": 5, "t923": 5, "t909": 5, "406": 5, "114": 5, "468": 5, "136": 5, "t271": 5, "533": 5, "048": [5, 7], "491": 5, "332": 5, "t320": 5, "t608": 5, "t76": 5, "840": 5, "956": 5, "890": 5, "t20": 5, "627": 5, "243": 5, "t628": 5, "t602": 5, "t192": 5, "t410": 5, "735": 5, "636": 5, "t344": 5, "t144": 5, "470": 5, "657": 5, "831": 5, "125": 5, "162": 5, "t173": 5, "752": 5, "corrobor": 5, "mortgag": [5, 6], "classifi": [5, 8], "37": [5, 7, 8], "swap": 5, "remeasur": 5, "notion": 5, "069": 5, "730": 5, "575": 5, "493": 5, "t104": 5, "777": 5, "nhedg": 5, "433": 5, "505": 5, "247": [5, 8], "ntrade": 5, "41": [5, 7, 8], "44": [5, 8], "depreci": 5, "nland": 5, "690": 5, "nmachineri": 5, "t80": 5, "205": [5, 7], "314": 5, "nleasehold": 5, "839": 5, "599": 5, "73": [5, 7, 8], "884": 5, "852": 5, "t55": 5, "906": 5, "601": 5, "703": 5, "010": 5, "457": 5, "634": 5, "391": 5, "neuropean": 5, "opinion": [5, 6, 8], "1991": 5, "2007": 5, "irish": 5, "branch": 5, "2003": 5, "2014": 5, "2015": 5, "minist": 5, "juli": [5, 8], "annul": 5, "ecj": 5, "hear": 5, "asid": 5, "confirm": 5, "unrecogn": [5, 6], "nfeder": 5, "571": 5, "080": 5, "644": 5, "265": 5, "801": 5, "726": 5, "570": 5, "298": 5, "49": [5, 6, 8], "t84": 5, "428": 5, "603": 5, "483": [5, 8], "t347": 5, "t669": 5, "076": 5, "830": 5, "419": 5, "072": 5, "pretax": 5, "72": [5, 6, 8], "ncomput": 5, "885": 5, "012": 5, "124": 5, "518": 5, "nimpact": 5, "246": 5, "311": 5, "366": 5, "397": 5, "nexcess": 5, "893": 5, "871": 5, "192": [5, 8], "739": 5, "ntax": 5, "carryforward": 5, "302": 5, "naccru": 5, "413": [5, 8], "421": 5, "nunreal": 5, "173": 5, "168": 5, "873": 5, "743": 5, "nless": 5, "374": 5, "007": 5, "369": 5, "551": 5, "998": 5, "nright": 5, "179": 5, "nminimum": 5, "674": 5, "940": 5, "t511": 5, "t455": 5, "t490": 5, "805": 5, "202": 5, "indefinit": 5, "temporari": 5, "727": 5, "044": 5, "284": 5, "ndecreas": 5, "386": 5, "463": 5, "982": 5, "542": 5, "936": 5, "070": 5, "expir": 5, "statut": 5, "229": 5, "494": 5, "closur": 5, "intercompani": 5, "exceed": [5, 8], "multiyear": 5, "exercis": 5, "noncash": 5, "rou": 5, "tfinanci": 5, "t2024": 5, "tother": 5, "661": 5, "tproperti": 5, "015": 5, "303": 5, "676": 5, "t165": 5, "t752": 5, "t859": 5, "430": 5, "842": [5, 8], "tfinanc": 5, "n2025": 5, "820": 5, "t171": 5, "991": 5, "n2026": 5, "914": 5, "n2027": 5, "t59": 5, "733": 5, "n2028": 5, "360": 5, "t38": 5, "398": 5, "n2029": 5, "187": 5, "nthereaft": 5, "t837": 5, "undiscount": 5, "790": 5, "imput": 5, "376": 5, "534": 5, "t896": 5, "borrow": 5, "proce": 5, "nine": [5, 8], "nmatur": 5, "333": 5, "264": 5, "948": 5, "645": 5, "309": 5, "arrear": 5, "namount": 5, "n2013": 5, "nfix": 5, "2062": 5, "t97": 5, "341": 5, "03": [5, 6], "65": [5, 8], "t106": 5, "572": 5, "n97": 5, "nunamort": 5, "321": 5, "358": 5, "113": 5, "662": 5, "930": 5, "342": 5, "800": 5, "180": 5, "88": [5, 6], "ndure": 5, "425": 5, "426": 5, "372": 5, "589": 5, "055": 5, "appreci": 5, "four": [5, 7, 8], "holder": [5, 7], "n2014": 5, "bonu": 5, "nrestrict": 5, "nnumber": 5, "nrsu": 5, "ngrant": 5, "naggreg": 5, "nfair": 5, "nbalanc": 5, "t240": 5, "427": [5, 8], "t75": 5, "t150": 5, "861": 5, "501": 5, "768": 5, "87": [5, 6, 7, 8], "101": [5, 8], "878": 5, "144": 5, "t127": 5, "t135": 5, "91": [5, 8], "456": 5, "78": [5, 7, 8], "59": [5, 8], "t140": 5, "326": 5, "t158": 5, "204": 5, "350": 5, "002": [5, 7], "nuncondit": 5, "uncondit": 5, "206": 5, "440": 5, "156": 5, "t633": 5, "t670": 5, "226": 5, "45": 5, "nconting": 5, "accrual": 5, "nconcentr": 5, "attribut": [5, 6, 7, 8, 9], "46": 5, "t67": 5, "098": 5, "082": 5, "062": 5, "569": 5, "895": 5, "458": 5, "207": 5, "nonrecur": 5, "t142": 5, "196": 5, "t138": 5, "t147": 5, "859": 5, "nchina": 5, "n66": 5, "t181": 5, "887": 5, "t172": 5, "269": 5, "nlong": 5, "664": 5, "797": 5, "778": 5, "219": 5, "nopinion": 5, "nwe": 5, "fairli": 5, "pcaob": 5, "sponsor": 5, "treadwai": 5, "2013": 5, "unqualifi": [5, 6], "thereon": 5, "nthese": 5, "misstat": 5, "fraud": [5, 8], "ndescript": 5, "naudit": 5, "nhow": 5, "nmatter": 5, "qualifi": 5, "letter": [5, 6], "advisor": 5, "ernst": 5, "llp": 5, "auditor": [5, 6], "2009": 5, "nsan": 5, "jose": 5, "nnovemb": 5, "coso": 5, "nour": 5, "ndefinit": 5, "disposit": 5, "receipt": 5, "nevalu": 5, "nbase": 5, "supervis": [5, 7, 8, 9], "13a": 5, "15d": 5, "ninher": 5, "paragraph": 5, "51": [5, 8, 9], "ninsid": 5, "deirdr": 5, "brien": 5, "vice": 5, "presid": 5, "affirm": 5, "april": 5, "withhold": 5, "remitt": 5, "mr": 5, "copi": [5, 6], "solicit": 5, "00042": 5, "nincorpor": 5, "texhibit": 5, "descript": [5, 6, 7, 8, 9], "tform": 5, "tfile": 5, "nrestat": 5, "namend": 5, "bylaw": 5, "nindentur": 5, "york": [5, 6, 7, 9], "mellon": 5, "truste": 5, "noffic": 5, "certif": 5, "2018": 5, "85": [5, 7, 8], "05": [5, 6], "2044": 5, "februari": 5, "2045": 5, "900": 5, "700": [5, 7], "250": [5, 8], "2036": 5, "2046": 5, "450": 5, "2047": 5, "2049": 5, "2030": 5, "2050": 5, "2060": 5, "2028": 5, "2041": 5, "2061": 5, "2032": 5, "2052": 5, "54": [5, 6], "2033": 5, "2053": 5, "n12": 5, "nsubsidiari": 5, "n23": 5, "nconsent": 5, "n24": 5, "npower": 5, "signatur": 5, "nrule": 5, "nsection": 5, "1350": 5, "n101": 5, "ninlin": 5, "xbrl": 5, "n104": 5, "inlin": 5, "compensatori": 5, "herewith": 5, "furnish": 5, "herebi": 5, "undertak": 5, "56": [5, 7, 8], "nsignatur": 5, "npursuant": 5, "duli": 5, "undersign": 5, "thereunto": 5, "ndate": 5, "nby": 5, "luca": [5, 9], "maestri": 5, "nluca": 5, "nsenior": 5, "nchief": 5, "nknow": 5, "THESE": 5, "appoint": 5, "cook": 5, "jointli": 5, "her": 5, "substitut": 5, "him": 5, "thereto": 5, "therewith": 5, "ratifi": 5, "virtu": 5, "hereof": 5, "nname": 5, "ttitl": 5, "tdate": 5, "tchief": 5, "tnovemb": 5, "ntimothi": 5, "tsenior": 5, "kondo": 5, "nchri": 5, "wanda": 5, "austin": 5, "nwanda": 5, "gorski": 5, "tdirector": 5, "nalex": 5, "jung": 5, "nandrea": 5, "arthur": 5, "levinson": 5, "narthur": 5, "monica": 5, "lozano": 5, "nmonica": 5, "ronald": 5, "sugar": 5, "nronald": 5, "susan": 5, "wagner": 5, "nsusan": 5, "57": [5, 7], "turbo": [5, 7, 9], "outlin": [5, 7, 8], "invdestacksmeticsisdict": 5, "setispect": 5, "20cyan": 5, "evaluationseld": 5, "anvis": 5, "droitent": 5, "discernminerv": 5, "versbobprefvers": 5, "vo\u8be5": 5, "option\u548c": 5, "meio": 5, "\u0432\u0440\u0435\u043ccisco": 5, "dellaischenpoihscap": 5, "geme": 5, "gettim": 5, "unscal": 5, "vocabulari": [5, 7, 9], "closer": 5, "sharpen": 5, "uniform": 5, "raschka": 5, "repetit": [5, 9], "radic": 5, "grappl": 5, "safer": [5, 8], "fascin": 5, "spontan": 5, "answer": [5, 6, 7, 8, 9], "aren": [5, 7], "linear": 5, "absent": [5, 8], "coax": 5, "journei": 5, "suddenli": 5, "manifest": 5, "deliber": [5, 8], "contend": 5, "rethink": 5, "tutor": 5, "children": [5, 8], "verifi": [5, 6, 7, 9], "predefin": [5, 9], "weren": 5, "kind": [5, 6], "usual": 5, "quantif": 5, "contamin": [5, 8], "unseen": [5, 8], "longitudin": 5, "mostli": [5, 9], "latter": 5, "tailor": [5, 8], "great": [5, 7, 9], "cognit": 5, "misinform": [5, 8], "fabric": [5, 8], "tempor": [5, 6], "disclaim": 5, "referr": 5, "incorrect": [5, 8], "demograph": [5, 8], "stereotyp": [5, 8], "societ": [5, 8], "pii": [5, 8], "anonym": 5, "leakag": [5, 8], "carryov": 5, "fallaci": 5, "think": [5, 7, 8], "idiom": 5, "sarcasm": 5, "terminologi": 5, "lingual": 5, "misunderstand": 5, "syntax": 5, "scan": [5, 6], "compat": [5, 6, 7, 9], "overconfid": [5, 6], "clariti": [5, 6, 8, 9], "audienc": 5, "densiti": 5, "satisfact": [5, 9], "misus": [5, 8], "moral": 5, "co2": 5, "etc": [5, 6, 9], "palm": [5, 7], "easi": [5, 6, 7, 8], "synthet": [5, 7, 8, 9], "timeout": 5, "inter": 5, "rater": 5, "ti": 5, "holist": [5, 8], "experiment": [5, 7, 9], "vi": 5, "categor": [5, 7, 8, 9], "intrins": [5, 7], "extrins": 5, "sequenc": [5, 6, 7, 9], "perplex": [5, 7], "downstream": [5, 9], "synthesi": 5, "discret": 5, "prefix": [5, 8], "roug": 5, "bleu": 5, "bilingu": 5, "understudi": 5, "overlap": [5, 6], "favor": [5, 7, 9], "breviti": 5, "insensit": 5, "semant": [5, 6, 9], "orient": [5, 8], "gist": 5, "meteor": 5, "synonym": 5, "paraphras": 5, "alongsid": [5, 8], "computation": [5, 6], "cider": 5, "consensu": 5, "tf": 5, "idf": 5, "caption": 5, "reliant": [5, 6], "corpu": [5, 6, 7], "ter": 5, "edit": [5, 8], "hypothesi": 5, "penal": 5, "bertscor": 5, "contextu": [5, 8], "bert": 5, "spice": 5, "proposit": [5, 7], "scene": [5, 6, 8], "analyst": [5, 6], "rouge_1": 5, "rouge_2": 5, "ideal": [5, 7, 8, 9], "setup": [5, 7, 8, 9], "evaluate_summari": 5, "unigram": 5, "bigram": 5, "absl": 5, "py": [5, 9], "rouge_scor": 5, "generated_summari": 5, "reference_summari": 5, "google_bleu": 5, "bleu_scor": 5, "rouge1": 5, "rouge2": 5, "arbitrari": 5, "chosen": [5, 8], "sentence1": 5, "cat": [5, 8], "sat": 5, "mat": 5, "sentence2": 5, "ate": 5, "3333333333333333": 5, "7272727272727272": 5, "4444444444444445": 5, "generate_summari": 5, "summir": 5, "liner": 5, "evaluate_summary_model": 5, "model_benchmark": 5, "models_test": 5, "benchmark_summari": 5, "model_summari": 5, "evaluation_result": 5, "statu": 5, "concis": [5, 7], "element": [5, 8, 9], "verbos": [5, 7, 8, 9], "peripher": 5, "quit": [5, 6, 7, 9], "convei": 5, "breadth": 5, "Of": [5, 7, 8], "vibe": 5, "visualize_prompt_comparison": 5, "matplotlib": 5, "radar": 5, "radar_plot": 5, "tmp": 5, "ipykernel_1652501": 5, "940173201": 5, "userwarn": [5, 9], "figurecanvasagg": 5, "largest": [5, 7], "sarmah": 5, "granular": [5, 6, 7], "likert": 5, "ensembl": 5, "repeatedli": [5, 6], "fluenci": 5, "refin": 5, "integ": [5, 9], "rubric": 5, "hollist": 5, "judgeevalu": 5, "grammar": [5, 7, 9], "evaluate_with_llm": 5, "criterion": 5, "judge_model": 5, "candidate_summari": 5, "grammat": 5, "y": [5, 6, 8, 9], "z": 5, "w": [5, 6, 7, 8], "benchmark_model": 5, "test_model": 5, "input_text": [5, 6, 7], "trillion": [5, 7], "evals_list": 5, "1775618912": 5, "slightli": 5, "drift": [5, 8], "lowest": [5, 7], "firstli": 5, "overhead": [5, 7], "egocentr": 5, "tight": 5, "medicin": [5, 8], "glider": 5, "deshpand": 5, "3b": 5, "685": 5, "aplic": 5, "earlier": [5, 8], "depict": [5, 8, 9], "multilingu": [5, 7, 8], "golden": 5, "languang": 5, "arena": 5, "randomli": 5, "customiz": [5, 7, 8], "irrelev": 5, "unhelp": [5, 8], "occasion": 5, "rare": 5, "perfectli": 5, "cater": [5, 7], "critiqu": [5, 8], "elo": 5, "exam": 5, "probe": [5, 8], "certifi": 5, "began": [5, 7], "glue": 5, "entail": [5, 7], "superglu": 5, "successor": 5, "grew": 5, "big": [5, 7], "bench": [5, 7], "srivastava": 5, "truthfulqa": [5, 7], "multitask": 5, "hendryck": [5, 8], "multidisciplinari": 5, "stanford": 5, "helm": 5, "multidimension": 5, "surround": [5, 7, 8, 9], "humanev": [5, 7], "lmsy": 5, "brought": 5, "dialogu": [5, 7], "chiang": 5, "gather": 5, "alpacaev": 5, "duboi": 5, "mt": 5, "argilla": 5, "mila": 5, "mit": [5, 7], "contributor": [5, 7, 9], "western": 5, "centric": 5, "divid": [5, 6, 8], "subset": [5, 8], "agnost": 5, "dialect": 5, "render": [5, 8], "crowdsourc": 5, "livebench": 5, "white": [5, 8], "resili": [5, 6, 8], "meaningfulli": 5, "zebralog": 5, "grid": 5, "puzzl": 5, "brailsford": 5, "1999": 5, "lsat": 5, "hous": 5, "clue": 5, "deduct": 5, "programmat": [5, 9], "2x2": 5, "6x6": 5, "shot": [5, 8, 9], "reductio": 5, "ad": [5, 6, 7, 8, 9], "absurdum": 5, "hard": [5, 6], "10b": 5, "counterfactu": 5, "came": 5, "arc": 5, "prize": [5, 8], "chollet": 5, "mike": [5, 6, 8], "knoop": 5, "founder": 5, "zapier": 5, "fran\u00e7oi": 5, "creator": [5, 7], "agi": 5, "kera": 5, "genuin": 5, "possess": [5, 6], "elementari": 5, "novelti": 5, "interpol": 5, "synthes": 5, "fly": 5, "brute": 5, "pixel": 5, "unbeaten": 5, "win": [5, 7], "poorli": 5, "recombin": 5, "spur": [5, 8], "takeawai": 5, "vertic": [5, 8], "finbench": 5, "legalbench": 5, "guha": 5, "berkelei": 5, "bfcl": 5, "patil": 5, "fourrier": 5, "bespok": 5, "sdk": 5, "autoregress": 5, "sub": [5, 7], "liter": 5, "disturb": 5, "zero": [5, 7, 8, 9], "varianc": [5, 8], "yt": 5, "ut": 5, "ol": 5, "heteroscedast": 5, "regress": 5, "bivari": 5, "evaluation_track": 5, "evaluationtrack": 5, "model_config": 5, "basemodelconfig": 5, "parallelismmanag": 5, "pipelineparamet": 5, "envconfig": 5, "is_accelerate_avail": 5, "datetim": [5, 6], "timedelta": [5, 6], "initprocessgroupkwarg": 5, "create_evaluation_pipelin": 5, "cache_dir": 5, "float16": 5, "max_sampl": 5, "kwargs_handl": 5, "3000": 5, "save_detail": 5, "pipeline_param": 5, "launcher_typ": 5, "env_config": 5, "override_batch_s": 5, "use_chat_templ": 5, "trust_remote_cod": 5, "pipeline_paramet": 5, "schemat": [5, 6], "vllm": [5, 9], "tgi": 5, "num_few_shot": 5, "bar": 5, "bigbench": 5, "winogrand": 5, "hellaswag": 5, "nlp": [5, 6, 7, 8], "save_and_push_result": 5, "show_result": 5, "model_arg": 5, "send": [5, 6, 7, 8, 9], "serverless": 5, "inference_server_address": 5, "inference_server_auth": 5, "model_id": 5, "null": 5, "bash": [5, 7], "command": [5, 6, 7], "model_config_path": 5, "endpoint_model": 5, "llama3": 5, "qwen2": [5, 7, 9], "alibaba": [5, 7, 9], "5b": [5, 7, 9], "hui": [5, 7], "allal": [5, 7], "cluster": 5, "noteworthi": [5, 7], "grain": [5, 7, 9], "salt": [5, 9], "modular": 5, "offici": [5, 9], "revisit": 5, "trace": 5, "langchain_tracing_v2": 5, "langchain_api_kei": 5, "hf_evalu": 5, "langsmith_evalu": 5, "ls_client": 5, "dataset_nam": 5, "create_dataset": 5, "create_exampl": 5, "dataset_id": 5, "calculate_scor": 5, "reference_output": 5, "oai_client": 5, "xp_model_nam": 5, "lastli": 5, "run_evalu": 5, "And": [5, 6, 7, 8], "upload_result": 5, "experiment_prefix": 5, "num_repetit": 5, "386a3620": 5, "9e1cc3cb": 5, "9d6a": 5, "4356": 5, "ab34": 5, "138e0abe8be4": 5, "8741976e": 5, "5268": 5, "4b75": 5, "949f": 5, "99477dde5d64": 5, "selectedsess": 5, "b831dc1e": 5, "90bc": 5, "4ed8": 5, "8080": [5, 7], "fb42444724d6": 5, "4it": 5, "latest": [5, 6, 7, 8, 9], "tobia": [5, 9], "evaluate_modul": 5, "6fc70b7be0088120a372dfdd5d320b39b8bb3630cb8029b193941d9376e86bb0": 5, "tue": 5, "nov": [5, 7], "couldn": 5, "5it": 5, "5053784e": 5, "64445871": 5, "a53c": 5, "44b1": 5, "a422": 5, "4f49b2f9656f": 5, "69": [5, 8], "4b29f3c9": 5, "9ef7e39a": 5, "2add": 5, "410c": 5, "89f8": 5, "9f1a8b198cf1": 5, "61": [5, 8], "insert": [5, 6], "combined_df": 5, "concat": [5, 8], "ignore_index": [5, 8], "execution_tim": 5, "example_id": 5, "333333": 5, "224388": 5, "feb10f92": 5, "3167": 5, "41f3": 5, "bb1c": 5, "d271153a31a8": 5, "5b196b22": 5, "9f4c": 5, "489c": 5, "b020": 5, "7823208b42d6": 5, "348101": 5, "722464": 5, "c310f159": 5, "064a": 5, "4035": 5, "97c3": 5, "a25bbf43abc2": 5, "386076": 5, "704104": 5, "f7f24899": 5, "dd50": 5, "409e": 5, "93cc": 5, "6fb1622b60bf": 5, "443038": 5, "725059": 5, "242856d6": 5, "efb5": 5, "4101": 5, "b1cf": 5, "5805532838ac": 5, "373418": 5, "795302": 5, "ce975169": 5, "a0ab": 5, "40ce": 5, "8e32": 5, "efa28d06079d": 5, "stat": [5, 7], "groupbi": [5, 8], "agg": [5, 8], "sort": 5, "sort_valu": 5, "subplot": 5, "pyplot": 5, "plt": 5, "ax1": 5, "ax2": 5, "figsiz": 5, "2ecc71": 5, "3498db": 5, "e74c3c": 5, "bleu_mean": 5, "bleu_std": 5, "enumer": [5, 6, 8], "errorbar": 5, "yerr": 5, "fmt": 5, "markers": 5, "capsiz": 5, "set_ylabel": 5, "set_titl": 5, "set_xtick": 5, "set_xticklabel": 5, "rotat": 5, "set_ylim": 5, "bottom": 5, "legend": 5, "exec_mean": 5, "exec_std": 5, "tight_layout": 5, "ndetail": 5, "4038": 5, "0453": 5, "7815": 5, "0433": 5, "3768": 5, "0424": 5, "8343": 5, "2208": 5, "3519": 5, "0775": 5, "9122": 5, "1482": 5, "377": 5, "042": 5, "078": 5, "slower": [5, 8], "04": [5, 7], "interestingli": 5, "decoupl": 5, "reload": 5, "facilit": [5, 8], "promptfooconfig": 5, "model_comparison": 5, "pretti": [5, 8], "dump": 5, "default_flow_styl": 5, "sort_kei": 5, "prompt1": 5, "defaulttest": 5, "1000m": 5, "eval_data": 5, "latency_m": 5, "totallatencym": 5, "token_usag": 5, "tokenusag": 5, "assert_pass": 5, "assertpasscount": 5, "assert_fail": 5, "assertfailcount": 5, "prompt_token": [5, 7], "num_request": 5, "numrequest": 5, "2463": 5, "000035": 5, "3773": 5, "004620": 5, "1669": 5, "000091": 5, "1669m": 5, "highest": [5, 7, 9], "3773m": 5, "00462": 5, "promptfool": 5, "manual": [5, 6, 7, 8, 9], "redefin": 5, "prompt_comparison": 5, "prompt2": 5, "prompt3": 5, "prompt_fil": 5, "prompt_cont": 5, "BE": 5, "again": 5, "prompt_id": 5, "promptid": 5, "gradingresult": 5, "df_raw": 5, "reset_index": [5, 8], "eas": [5, 7, 8, 9], "hf": [5, 7], "plain": [5, 6, 7], "vanilla": 5, "defi": 5, "accustom": 5, "legaci": 5, "unsustain": 5, "prd": 5, "cultiv": [5, 8], "organiz": 5, "stagnat": 5, "alb": [5, 7], "loubna": [5, 7], "anton": [5, 7], "lozhkov": [5, 7], "bakouch": [5, 7], "gabriel": [5, 7, 8], "mart\u00edn": [5, 7, 8], "bl\u00e1zquez": [5, 7], "lewi": [5, 6, 7], "tunstal": [5, 7], "agust\u00edn": [5, 7], "piquer": [5, 7], "andr": [5, 6, 7], "marafioti": [5, 7], "cyril": [5, 7], "zakka": [5, 7], "leandro": [5, 7], "werra": [5, 7], "wolf": [5, 7], "are24": 5, "judgearena": 5, "bps99": 5, "salli": 5, "pott": 5, "barbara": 5, "557": [5, 8], "sciencedirect": 5, "s0377221798003646": 5, "doi": [5, 6, 8, 9], "1016": 5, "s0377": 5, "2217": 5, "00364": 5, "ctj": 5, "jerri": [5, 8], "tworek": [5, 8], "heewoo": [5, 8], "jun": [5, 8], "qime": [5, 8], "henriqu": [5, 8], "pond": [5, 8], "de": [5, 8], "oliveira": [5, 8], "pinto": [5, 8], "harri": [5, 8], "yuri": 5, "burda": 5, "greg": [5, 8], "brockman": [5, 8], "raul": [5, 8], "puri": [5, 8], "gretchen": [5, 8], "krueger": [5, 8], "petrov": [5, 8], "heidi": 5, "khlaaf": 5, "girish": [5, 8], "sastri": [5, 8], "brook": [5, 8], "chan": [5, 8], "grai": [5, 8], "ryder": [5, 8], "mikhail": [5, 8], "pavlov": [5, 8], "alethea": [5, 8], "lukasz": 5, "kaiser": [5, 8], "mohammad": [5, 8], "bavarian": [5, 8], "clemen": [5, 8], "winter": [5, 8], "philipp": 5, "tillet": [5, 8], "felip": [5, 8], "petroski": [5, 8], "dave": [5, 8], "cum": [5, 8], "plappert": 5, "fotio": 5, "chantzi": [5, 8], "barn": 5, "ariel": 5, "herbert": 5, "voss": [5, 8], "hebgen": 5, "guss": 5, "nichol": 5, "paino": [5, 8], "nikola": [5, 8], "tezak": [5, 8], "babuschkin": [5, 8], "suchir": [5, 8], "balaji": [5, 8], "shantanu": [5, 8], "jain": [5, 8], "hess": [5, 8], "carr": 5, "josh": [5, 8], "achiam": [5, 8], "vedant": 5, "misra": 5, "evan": [5, 7, 8], "morikawa": [5, 8], "matthew": 5, "knight": [5, 8], "mile": [5, 8], "brundag": [5, 8], "mira": [5, 8], "murati": [5, 8], "kati": [5, 8], "mayer": [5, 8], "bob": [5, 8, 9], "mcgrew": [5, 8], "ilya": [5, 8], "sutskev": [5, 8], "wojciech": [5, 8], "zaremba": [5, 8], "2107": 5, "03374": 5, "cz": 5, "lianmin": 5, "ying": 5, "sheng": 5, "anastasio": 5, "angelopoulo": 5, "tianl": 5, "dacheng": 5, "banghua": 5, "jordan": [5, 8], "gonzalez": 5, "ion": 5, "stoica": 5, "04132": 5, "cho24a": 5, "francoi": 5, "arcpriz": 5, "cho24b": 5, "drcw": 5, "darshan": 5, "selvan": 5, "sunitha": 5, "ravi": 5, "sky": 5, "ch": 5, "bartosz": 5, "mielczarek": 5, "anand": [5, 8], "kannappan": [5, 8], "qian": [5, 8], "14140": 5, "dglh24": 5, "yann": 5, "bal\u00e1z": 5, "galambosi": 5, "tatsunori": 5, "hashimoto": 5, "debia": 5, "04475": 5, "fac24a": 5, "wiki": [5, 9], "fac24b": 5, "fac24c": 5, "model_doc": 5, "fac24d": 5, "cookbook": 5, "llm_judg": 5, "fac24f": 5, "fhwt23": 5, "cl\u00e9mentin": 5, "nathan": 5, "habib": 5, "gnh": 5, "julian": 5, "nyarko": 5, "ho": 5, "r\u00e9": 5, "adam": [5, 8], "chilton": 5, "aditya": [5, 8], "narayana": 5, "chohla": 5, "brandon": [5, 8, 9], "waldon": 5, "rockmor": 5, "diego": 5, "zambrano": 5, "dmitri": 5, "talisman": 5, "enam": 5, "hoqu": 5, "faiz": 5, "surani": 5, "frank": [5, 8], "fagan": 5, "galit": 5, "sarfati": 5, "gregori": 5, "dickinson": 5, "haggai": 5, "porat": 5, "hegland": 5, "jessica": [5, 8], "joe": [5, 8], "nudel": 5, "joel": [5, 8], "niklau": 5, "nai": 5, "choi": 5, "margaret": [5, 7], "hagan": 5, "megan": 5, "livermor": 5, "nikon": 5, "rasumov": 5, "rahe": 5, "nil": 5, "holzenberg": 5, "noam": 5, "kolt": 5, "henderson": 5, "rehaag": 5, "sharad": 5, "shang": 5, "spencer": 5, "sunni": 5, "gandhi": 5, "zur": 5, "varun": 5, "iyer": 5, "zehua": 5, "2308": 5, "11462": 5, "hbb": 5, "collin": 5, "burn": 5, "steven": [5, 8], "basart": [5, 8], "zou": [5, 8], "manta": [5, 8], "mazeika": [5, 8], "03300": 5, "hbd": 5, "maxwel": 5, "forb": 5, "yejin": 5, "curiou": 5, "neural": [5, 9], "degener": 5, "1904": 5, "09751": 5, "hyc": [5, 7], "binyuan": [5, 7], "zeyu": [5, 7], "cui": [5, 7], "jiaxi": [5, 7], "dayiheng": [5, 7], "tianyu": [5, 7], "jiajun": [5, 7], "kai": [5, 7, 8], "dang": [5, 7], "coder": [5, 7], "preprint": [5, 7, 9], "2409": [5, 7, 8], "12186": [5, 7], "lx": 5, "zhen": 5, "xiaohan": 5, "jia": 5, "yuxuan": 5, "lai": 5, "chongyang": 5, "shuai": 5, "nlg": 5, "07103": 5, "lbl": 5, "bommasani": 5, "toni": 5, "dimitri": 5, "tsipra": 5, "dilara": 5, "soylu": 5, "michihiro": 5, "yasunaga": 5, "yian": 5, "deepak": 5, "narayanan": 5, "yuhuai": 5, "newman": 5, "binhang": 5, "bobbi": 5, "ce": 5, "christian": [5, 8], "cosgrov": 5, "acosta": 5, "nava": [5, 8], "drew": 5, "hudson": 5, "zelikman": 5, "esin": 5, "durmu": 5, "faisal": 5, "ladhak": 5, "frieda": 5, "rong": [5, 6], "ren": [5, 7], "huaxiu": 5, "yao": [5, 8, 9], "jue": 5, "keshav": 5, "santhanam": 5, "laurel": 5, "lucia": 5, "mert": 5, "yuksekgonul": 5, "mirac": 5, "suzgun": 5, "niladri": 5, "chatterji": 5, "omar": 5, "khattab": 5, "chi": [5, 9], "sang": 5, "shibani": [5, 8], "santurkar": [5, 8], "surya": 5, "icard": 5, "tianyi": 5, "vishrav": 5, "chaudhari": 5, "xuechen": 5, "yuhui": 5, "yuta": 5, "koreeda": 5, "2211": 5, "09110": 5, "lbc24": 5, "ronan": 5, "bra": 5, "allenai": 5, "lhe22": [5, 7, 8], "stephani": [5, 7, 8], "owain": [5, 7, 8], "mimic": [5, 7, 8], "falsehood": [5, 7, 8], "2109": [5, 7, 8], "07958": [5, 7, 8], "pzwg23": 5, "shishir": 5, "tianjun": 5, "xin": [5, 8], "gorilla": 5, "15334": 5, "pro24": 5, "dev": 5, "ras24": 5, "sebastian": [5, 6], "scratch": 5, "1633437166": 5, "sll": 5, "bhaskarjit": 5, "mingshu": 5, "jingrao": 5, "lyu": 5, "nathalia": 5, "castellano": 5, "pasquali": 5, "dhagash": 5, "12148": 5, "srf": 5, "shivalika": 5, "angelika": 5, "roman": [5, 8], "adelani": 5, "ngui": 5, "vila": 5, "suero": 5, "peerat": 5, "limkonchotiwat": 5, "kelli": 5, "marchisio": 5, "qi": 5, "leong": 5, "yosephin": 5, "susanto": 5, "raymond": [5, 8], "ng": [5, 8], "shayn": 5, "longpr": 5, "ko": 5, "madelin": 5, "antoin": 5, "bosselut": 5, "oh": 5, "leshem": 5, "choshen": 5, "daphn": 5, "ippolito": 5, "enzo": [5, 9], "ferrant": 5, "marzieh": 5, "fadae": 5, "beyza": 5, "ermi": 5, "sara": 5, "hooker": 5, "linguist": [5, 6, 8], "03304": 5, "srr": 5, "aarohi": 5, "abhinav": 5, "rastogi": 5, "abhishek": 5, "rao": 5, "abu": 5, "awal": 5, "shoeb": 5, "abubakar": 5, "abid": [5, 7], "fisch": 5, "santoro": 5, "gupta": 5, "adri\u00e0": 5, "garriga": 5, "alonso": 5, "agnieszka": 5, "kluska": 5, "aitor": 5, "lewkowycz": 5, "akshat": 5, "warstadt": 5, "alexand": [5, 8, 9], "kocurek": 5, "ali": [5, 8], "safaya": 5, "tazarv": 5, "aman": 5, "hussain": 5, "dsouza": 5, "ambros": 5, "slone": 5, "ameet": 5, "rahan": 5, "anantharaman": 5, "ander": 5, "andreassen": 5, "madotto": 5, "santilli": 5, "stuhlm\u00fcller": 5, "la": 5, "lampinen": 5, "angelica": 5, "anh": 5, "vuong": 5, "animesh": 5, "gottardi": 5, "antonio": 5, "norelli": 5, "anu": 5, "venkatesh": 5, "arash": 5, "gholamidavoodi": 5, "arfa": 5, "tabassum": 5, "arul": 5, "menez": 5, "arun": [5, 8], "kirubarajan": 5, "asher": 5, "mullokandov": 5, "ashish": 5, "sabharw": 5, "herrick": 5, "avia": 5, "efrat": 5, "aykut": 5, "erdem": 5, "ayla": 5, "karaka\u015f": 5, "bao": [5, 7, 8], "loe": 5, "barret": [5, 8], "zoph": [5, 8], "bart\u0142omiej": 5, "bojanowski": 5, "batuhan": 5, "\u00f6zyurt": 5, "behnam": 5, "hedayatnia": 5, "neyshabur": 5, "inden": 5, "benno": 5, "stein": 5, "berk": 5, "ekmekci": 5, "blake": 5, "howald": 5, "bryan": 5, "orinion": 5, "diao": 5, "dour": 5, "stinson": 5, "cedrick": 5, "argueta": 5, "c\u00e9sar": 5, "ferri": 5, "ram\u00edrez": 5, "chandan": 5, "charl": 5, "rathkopf": 5, "chenlin": 5, "meng": 5, "chitta": 5, "baral": 5, "chiyu": 5, "callison": 5, "burch": 5, "voigt": 5, "cindi": 5, "ramirez": 5, "clara": 5, "rivera": 5, "clemencia": 5, "siro": 5, "colin": [5, 7], "raffel": [5, 7], "courtnei": 5, "ashcraft": 5, "cristina": 5, "garbacea": 5, "damien": [5, 8], "sileo": 5, "garrett": 5, "kilman": 5, "freeman": 5, "khashabi": 5, "levi": [5, 8], "mosegu\u00ed": 5, "gonz\u00e1lez": 5, "perszyk": 5, "danqi": 5, "dar": 5, "gilboa": 5, "dohan": [5, 8], "drakard": 5, "jurgen": 5, "debajyoti": 5, "datta": 5, "deni": 5, "emelin": 5, "kleyko": 5, "deniz": 5, "yuret": 5, "derek": [5, 8], "tam": [5, 9], "dieuwk": 5, "hupk": 5, "diganta": 5, "dilyar": 5, "buzan": 5, "coelho": 5, "mollo": 5, "diyi": 5, "dylan": 5, "schrader": 5, "ekaterina": 5, "shutova": 5, "ekin": 5, "dogu": 5, "cubuk": 5, "elad": 5, "segal": 5, "eleanor": 5, "hagerman": 5, "donowai": 5, "elli": 5, "pavlick": 5, "rodola": 5, "emma": 5, "lam": 5, "chu": [5, 8], "erkut": 5, "erni": 5, "dyer": 5, "jerzak": 5, "eunic": 5, "engefu": 5, "manyasi": 5, "evgenii": 5, "zheltonozhskii": 5, "fanyu": 5, "fatemeh": 5, "siar": 5, "fernando": 5, "mart\u00ednez": 5, "plume": 5, "francesca": 5, "happ\u00e9": 5, "gaurav": 5, "genta": 5, "indra": 5, "winata": 5, "gerard": 5, "melo": 5, "germ\u00e1n": 5, "kruszewski": 5, "giambattista": [5, 8], "parascandolo": [5, 8], "giorgio": 5, "mariani": 5, "gloria": 5, "gonzalo": 5, "jaimovitch": 5, "l\u00f3pez": 5, "gregor": 5, "betz": 5, "gui": [5, 7], "gur": 5, "hana": 5, "galijasev": 5, "rashkin": 5, "hannaneh": 5, "hajishirzi": 5, "harsh": 5, "hayden": 5, "bogar": 5, "henri": [5, 8], "shevlin": 5, "hinrich": 5, "sch\u00fctze": 5, "hiromu": 5, "yakura": 5, "hongm": 5, "hugh": 5, "mee": 5, "wong": [5, 6, 8], "isaac": 5, "nobl": 5, "jaap": 5, "jumelet": 5, "geissing": 5, "jaehoon": 5, "jaim": 5, "fern\u00e1ndez": 5, "fisac": 5, "simon": 5, "koppel": 5, "koco\u0144": 5, "jana": 5, "thompson": [5, 7, 8], "janel": 5, "wingfield": 5, "jarema": 5, "radom": 5, "jascha": 5, "sohl": [5, 8], "dickstein": 5, "phang": 5, "yosinski": 5, "jekaterina": 5, "novikova": 5, "jell": 5, "bosscher": 5, "jennif": 5, "marsh": 5, "jeroen": 5, "taal": 5, "engel": 5, "jesujoba": 5, "alabi": 5, "jiam": 5, "jillian": 5, "joan": 5, "waweru": 5, "burden": 5, "bali": 5, "batcheld": 5, "berant": 5, "j\u00f6rg": 5, "frohberg": 5, "jo": 5, "rozen": 5, "orallo": 5, "boudeman": 5, "guerr": 5, "tenenbaum": 5, "joyc": 5, "chua": 5, "kanclerz": 5, "karen": 5, "livescu": 5, "karl": 5, "krauth": 5, "karthik": 5, "gopalakrishnan": 5, "katerina": 5, "ignatyeva": 5, "katja": 5, "markert": 5, "kaustubh": 5, "dhole": 5, "gimpel": 5, "omondi": 5, "kori": 5, "mathewson": 5, "kristen": 5, "chiafullo": 5, "ksenia": 5, "shkaruta": 5, "shridhar": 5, "kyle": [5, 6, 8], "mcdonel": 5, "richardson": 5, "laria": 5, "reynold": 5, "leo": [5, 8], "dugan": 5, "lianhui": 5, "lidia": 5, "contrera": 5, "ochando": 5, "morenc": 5, "moschella": 5, "luci": 5, "ludwig": 5, "schmidt": [5, 8], "luheng": 5, "olivero": 5, "col\u00f3n": 5, "metz": [5, 8], "l\u00fctfi": 5, "kerem": 5, "\u015fenel": 5, "maarten": [5, 8], "bosma": 5, "sap": [5, 8], "maartj": 5, "hoev": 5, "maheen": 5, "farooqi": 5, "manaal": 5, "faruqui": 5, "marco": 5, "baturan": 5, "marelli": 5, "maru": 5, "maria": 5, "quintana": 5, "tolkiehn": 5, "mario": [5, 8], "giulianelli": 5, "martha": 5, "potthast": 5, "leavitt": 5, "hagen": 5, "m\u00e1ty\u00e1": 5, "schubert": 5, "medina": [5, 8], "orduna": 5, "baitemirova": 5, "melodi": 5, "arnaud": 5, "melvin": 5, "mcelrath": 5, "yee": 5, "cohen": 5, "ivanitskii": 5, "starritt": 5, "strube": 5, "micha\u0142": 5, "sw\u0119drowski": 5, "michel": [5, 8], "bevilacqua": 5, "mihir": 5, "kale": 5, "cain": 5, "mime": 5, "mitch": 5, "walker": 5, "mo": 5, "tiwari": 5, "mohit": 5, "bansal": 5, "moin": 5, "aminnaseri": 5, "mor": 5, "geva": 5, "mozhdeh": 5, "gheini": 5, "mukund": 5, "varma": 5, "nanyun": 5, "peng": [5, 8], "nayeon": 5, "neta": 5, "krakov": 5, "doiron": 5, "nicol": 5, "martinez": 5, "nikita": 5, "nangia": 5, "nikla": 5, "decker": 5, "muennighoff": 5, "nitish": [5, 8], "shirish": [5, 8], "keskar": [5, 8], "niveditha": 5, "constant": 5, "fiedel": 5, "nuan": 5, "wen": [5, 6], "oliv": [5, 8], "agha": 5, "elbaghdadi": 5, "omer": 5, "moreno": 5, "casar": 5, "parth": 5, "doshi": 5, "pascal": 5, "fung": 5, "pu": 5, "vicol": 5, "pegah": 5, "alipoormolabashi": 5, "peiyuan": 5, "eckerslei": 5, "phu": 5, "mon": 5, "htut": 5, "pinyu": 5, "hwang": 5, "piotr": 5, "mi\u0142kowski": 5, "piyush": 5, "pouya": 5, "pezeshkpour": 5, "priti": 5, "oli": 5, "qiaozhu": 5, "qing": 5, "qinlang": 5, "rabin": 5, "banjad": 5, "rachel": [5, 8], "etta": 5, "rudolph": 5, "raefer": 5, "rahel": 5, "haback": 5, "ramon": 5, "risco": 5, "rapha\u00ebl": 5, "milli\u00e8r": 5, "rhythm": 5, "garg": [5, 7], "rif": 5, "saurou": 5, "riku": 5, "arakawa": 5, "robb": 5, "raymaek": 5, "rohan": 5, "sikand": 5, "novak": 5, "sitelew": 5, "lebra": 5, "rosann": 5, "rowan": [5, 8], "ruslan": 5, "salakhutdinov": 5, "stoval": 5, "teehan": 5, "sahib": 5, "saif": 5, "sajant": 5, "dillav": 5, "shleifer": 5, "wiseman": 5, "gruetter": 5, "schoenholz": 5, "sanghyun": 5, "sanjeev": 5, "kwatra": 5, "sarik": 5, "ghazarian": 5, "sayan": 5, "casei": [5, 8], "bischoff": 5, "gehrmann": 5, "schuster": 5, "sepideh": 5, "sadeghi": 5, "shadi": 5, "hamdan": 5, "sharon": 5, "shashank": 5, "sherri": 5, "shi": 5, "shikhar": 5, "shima": 5, "asaadi": 5, "shubh": 5, "pachchigar": 5, "shubham": 5, "toshniw": 5, "shyam": [5, 8], "upadhyai": 5, "shyamolima": 5, "debnath": 5, "siamak": 5, "shakeri": 5, "thormey": 5, "melzi": 5, "siva": 5, "reddi": 5, "sneha": 5, "priscilla": 5, "makini": 5, "soo": 5, "hwan": 5, "toren": 5, "sriharsha": 5, "hatwar": 5, "stanisla": 5, "dehaen": 5, "stefan": 5, "divic": 5, "stella": 5, "biderman": 5, "stephen": 5, "prasad": 5, "piantadosi": 5, "stuart": [5, 8], "shieber": 5, "summer": [5, 8], "misherghi": 5, "svetlana": 5, "kiritchenko": 5, "swaroop": 5, "tal": 5, "linzen": 5, "tariq": 5, "tatsu": 5, "te": 5, "th\u00e9o": 5, "desbord": 5, "theodor": 5, "rothschild": 5, "phan": [5, 8], "tiberiu": 5, "nkinyili": 5, "timo": 5, "schick": 5, "timofei": 5, "kornev": 5, "titu": 5, "tunduni": 5, "gerstenberg": 5, "trenton": 5, "trishala": 5, "neeraj": 5, "tushar": 5, "khot": 5, "shultz": 5, "uri": 5, "shaham": 5, "vera": 5, "demberg": 5, "victoria": [5, 8], "nyamai": 5, "vika": 5, "raunak": 5, "vinai": 5, "ramasesh": 5, "udai": 5, "prabhu": 5, "vishakh": 5, "padmakumar": 5, "vivek": [5, 6], "srikumar": [5, 6], "fedu": [5, 8], "wout": 5, "vossen": 5, "xiaoyu": 5, "tong": [5, 8], "xinran": 5, "xinyi": 5, "yadollah": 5, "yaghoobzadeh": 5, "yair": 5, "lakretz": 5, "yangqiu": 5, "yasaman": 5, "bahri": 5, "yichi": 5, "yide": 5, "yifu": 5, "yonatan": 5, "belinkov": 5, "yufang": 5, "seid": 5, "zhuoy": 5, "zijian": 5, "ziji": 5, "zirui": 5, "ziyi": 5, "extrapol": 5, "2206": 5, "04615": 5, "wpn": 5, "yada": 5, "pruksachatkun": 5, "amanpreet": 5, "hill": 5, "stickier": 5, "wsm": 5, "1804": 5, "07461": 5, "wtb": 5, "tai": 5, "borgeaud": 5, "dani": 5, "yogatama": 5, "denni": [5, 8], "donald": 5, "metzler": 5, "ed": 5, "oriol": 5, "vinyal": 5, "dean": 5, "07682": 5, "wdr": 5, "doolei": 5, "manlei": 5, "arka": [5, 8], "pal": 5, "feuer": 5, "siddhartha": 5, "ravid": 5, "shwartz": [5, 8], "ziv": 5, "khalid": [5, 7], "saifullah": 5, "siddartha": 5, "naidu": 5, "chinmai": 5, "hegd": 5, "lecun": 5, "goldstein": 5, "willi": 5, "neiswang": 5, "micah": 5, "goldblum": 5, "19314": 5, "yyh": 5, "baosong": [5, 7], "chengpeng": 5, "chengyuan": [5, 7], "fei": [5, 7], "guant": 5, "haoran": [5, 7], "huan": [5, 7], "jialong": 5, "jialin": 5, "jianhong": [5, 7], "tu": [5, 7], "jianwei": [5, 7], "jianxin": [5, 7], "jin": [5, 6, 8], "jingren": [5, 7], "jinz": 5, "jinzheng": 5, "junyang": [5, 7], "keme": [5, 7], "keqin": [5, 7], "kexin": [5, 7], "mingfeng": [5, 7], "xue": [5, 7, 8], "ni": [5, 6], "pei": [5, 7], "ru": 5, "men": [5, 7], "ruiz": 5, "runji": [5, 7], "shiji": 5, "sinan": 5, "tianhang": 5, "wenbin": 5, "ge": 5, "xiaodong": 5, "deng": 5, "xiaohuan": 5, "xingzhang": [5, 7], "xinyu": [5, 8], "xipin": 5, "xuancheng": [5, 7], "yichang": [5, 7], "wan": [5, 7], "yunfei": 5, "yuqiong": [5, 7], "zhenru": [5, 7], "zhihao": 5, "10671": 5, "zcl24": 5, "zhihan": 5, "cao": 5, "lizi": 5, "openreview": [5, 6], "forum": [5, 6], "aegrf1uy0p": 5, "zc": 5, "siyuan": 5, "zhuang": [5, 8], "zhanghao": 5, "yonghao": 5, "zi": 5, "zhuohan": 5, "xing": [5, 8], "2306": 5, "05685": 5, "huggingface24": 5, "metaai24": 5, "doubl": 6, "steve": [6, 8], "lclm": 6, "simultan": [6, 7, 8], "cutoff": 6, "stale": 6, "amayuela": 6, "tail": 6, "kotha": 6, "unifi": [6, 7, 9], "codebas": [6, 7], "verif": [6, 7, 9], "ingest": 6, "preprocess": [6, 7, 9], "parser": [6, 9], "microsoft": [6, 7], "autogen": 6, "powerpoint": 6, "ocr": 6, "exif": 6, "metadata": [6, 7], "docker": [6, 7], "container": [6, 7], "xlsx": 6, "text_cont": 6, "ibm": [6, 7, 8], "docx": 6, "pptx": 6, "layout": 6, "llamaindex": 6, "document_convert": 6, "documentconvert": 6, "export_to_markdown": 6, "presenc": 6, "merril": 6, "lynch": 6, "cio": 6, "outlook": 6, "forecast_file_path": 6, "result_md": 6, "forecast_result_docl": 6, "levenshtein": 6, "distanc": 6, "sequencematch": 6, "difflib": 6, "longest": 6, "levenshtein_similar": 6, "text1": 6, "text2": 6, "max_len": 6, "simple_similar": 6, "ratio": [6, 7], "forecast_result_md": 6, "13985705461925346": 6, "17779960707269155": 6, "readabl": 6, "messi": 6, "2025e": 6, "compos": [6, 7, 8], "financial_vari": 6, "financial_forecast": 6, "econforecast": 6, "extract_prompt": 6, "base_prompt": [6, 9], "extract_from_doc": 6, "twice": 6, "md_financi": 6, "docling_financi": 6, "easier": [6, 7, 8, 9], "gdp": 6, "cpi": 6, "fed": 6, "df_md_forecast": 6, "df_docling_forecast": 6, "despit": [6, 7, 9], "underweight": 6, "neutral": [6, 8], "overweight": 6, "chart": 6, "asset_class_docl": 6, "asset_class_md": 6, "df_md": 6, "df_docl": 6, "true_valu": 6, "df_comparison": 6, "cap": 6, "exempt": 6, "markitdown_accuraci": 6, "docling_accuraci": 6, "93": [6, 7, 8], "sector": 6, "convert_and_export_t": 6, "file_path": 6, "doc_convert": 6, "start_tim": [6, 8], "conv_r": 6, "table_df": 6, "export_to_datafram": 6, "end_tim": 6, "2f": 6, "usd": 6, "wtd": 6, "mtd": 6, "ytd": 6, "djia": 6, "926": 6, "amp": 6, "051": 6, "277": 6, "russel": [6, 8], "2000": 6, "msci": 6, "817": [6, 8], "eaf": 6, "319": 6, "107": 6, "01": [6, 7], "66": [6, 8], "92": 6, "municip": 6, "79": [6, 8], "slight": 6, "discretionari": 6, "yellow": 6, "estat": 6, "orang": 6, "stapl": 6, "constructor": 6, "md_llm": 6, "llm_client": 6, "llm_model": 6, "png": 6, "overview": [6, 9], "showcas": 6, "bond": 6, "crude": 6, "oil": 6, "sit": 6, "648": 6, "ounc": 6, "euro": 6, "tactic": 6, "bofa": 6, "circl": [6, 8], "fetch": 6, "reassembl": 6, "max_output_token": 6, "statement": [6, 8], "10k": 6, "diagram": [6, 8], "charactertextsplitt": 6, "tiktoken": [6, 8], "sequenti": 6, "newlin": 6, "cheap": 6, "speciali": 6, "nltk": 6, "spaci": 6, "hierarch": [6, 8], "talk": 6, "theme": [6, 7, 8], "splitter": 6, "surpass": 6, "get_chunk": 6, "chunk_siz": 6, "chunk_overlap": 6, "langchain_text_splitt": 6, "text_splitt": 6, "from_tiktoken_encod": 6, "split_text": 6, "persona": 6, "langchain_cor": [6, 9], "prompttempl": 6, "get_base_prompt_templ": 6, "from_templ": 6, "llmchain": 6, "output_pars": 6, "stroutputpars": 6, "langchain_commun": 6, "chat_model": 6, "chatlitellm": 6, "get_llm_chain": 6, "prompt_templ": [6, 9], "llm_chain": [6, 9], "api_key_label": 6, "upper": 6, "_api_kei": 6, "get_dynamic_prompt_templ": 6, "get_dynamic_prompt_param": 6, "prompt_param": 6, "part_idx": 6, "total_part": 6, "chat_context": 6, "param": 6, "dynamic_prompt_param": 6, "concaten": 6, "generate_report": 6, "input_cont": 6, "llm_model_nam": 6, "report_part": 6, "num_part": 6, "dinam": 6, "priovid": 6, "invok": [6, 9], "cummul": 6, "max_chunk_s": 6, "max_chunk_overlap": 6, "gemini": [6, 7], "apple_report": 6, "report_cont": 6, "report_lin": 6, "splitlin": 6, "total_lin": 6, "quarter_lin": 6, "top_port": 6, "bottom_port": 6, "uncov": [6, 8, 9], "delv": 6, "consol": 6, "fewer": [6, 7, 8], "reaction": 6, "breakdown": [6, 8], "disciplin": 6, "appar": [6, 8], "subhead": 6, "depth": [6, 8], "2m": [6, 7], "harvard": [6, 7], "enrol": 6, "gov": [6, 8], "1039": 6, "birth": [6, 8], "democraci": 6, "tuesdai": 6, "magna": 6, "carta": 6, "trudg": 6, "dens": 6, "conversation": 6, "knowledge_bas": 6, "add_knowledge_bas": 6, "add_cit": 6, "bool": [6, 8], "num_quest": 6, "input_memori": 6, "response_memori": 6, "urls_memori": 6, "extractor": 6, "cic": 6, "citabl": 6, "passag": [6, 8], "corpora": 6, "formatted_cont": 6, "reference_id": 6, "wrapper": [6, 9], "content_gener": 6, "user_instruct": 6, "llmbackend": 6, "cache_ttl": 6, "cachedcont": 6, "display_nam": 6, "due_knowledge_bas": 6, "system_instruct": 6, "compose_prompt": 6, "conversation_config": 6, "ttl": 6, "generativemodel": 6, "from_cached_cont": 6, "cached_cont": 6, "behind": [6, 8], "quiz_inst": 6, "professor": 6, "difficulti": [6, 8], "dataset": [6, 9], "syllabu": 6, "kennedi": 6, "inaugur": 6, "lincoln": 6, "gettysburg": 6, "liberti": 6, "mayflow": 6, "abraham": 6, "gutenberg": 6, "kb": 6, "epub": 6, "pg": 6, "gemini_duo": 6, "genai_duo": 6, "duo": 6, "usage_metadata": 6, "38470": 6, "anytim": 6, "shap": 6, "mckechni": 6, "study_refer": 6, "pg10000": 6, "65363": 6, "pg65363": 6, "quizz": 6, "problemat": [6, 8], "misinterpret": 6, "awp": 6, "alfonso": 6, "liangm": 6, "pan": [6, 8], "wenhu": 6, "lun": 6, "ku": 6, "editor": [6, 8], "acl": [6, 8], "6416": 6, "6432": 6, "bangkok": 6, "thailand": 6, "aclanthologi": [6, 8], "383": 6, "18653": [6, 8], "v1": [6, 7, 8], "ksr24": 6, "suha": 6, "springer": 6, "aditi": 6, "raghunathan": 6, "twelfth": 6, "vrhif2hsrm": 6, "lcd": 6, "jinhyuk": 6, "zhuyun": 6, "dheeru": 6, "dua": 6, "devendra": 6, "sachan": 6, "boratko": 6, "luan": 6, "s\u00e9bastien": 6, "arnold": 6, "vincent": 6, "perot": 6, "siddharth": 6, "dalmia": 6, "hexiang": 6, "panupong": 6, "pasupat": 6, "aida": 6, "amini": 6, "cole": 6, "riedel": 6, "iftekhar": 6, "naim": 6, "ming": [6, 8], "guu": 6, "subsum": 6, "sql": 6, "13121": 6, "lpp": 6, "aleksandra": 6, "piktu": 6, "fabio": [6, 8], "petroni": 6, "vladimir": 6, "karpukhin": 6, "heinrich": 6, "k\u00fcttler": 6, "tau": 6, "yih": 6, "rockt\u00e4schel": 6, "douw": 6, "kiela": 6, "2005": 6, "11401": 6, "nbgc24": 6, "shiyu": 6, "kepe": 6, "bi": 6, "jiafeng": 6, "guo": [6, 8], "xueqi": 6, "cheng": [6, 9], "11375": 6, "11388": 6, "675": 6, "tdw": 6, "jiejun": 6, "zhicheng": 6, "dou": 6, "mang": 6, "weipeng": 6, "ji": 6, "htmlrag": 6, "02959": 6, "zlj": 6, "jiaji": 6, "yun": [6, 9], "metacognit": 6, "1453": 6, "1463": 6, "ny": [6, 8, 9], "usa": [6, 8, 9], "machineri": [6, 9], "1145": [6, 8, 9], "3589334": 6, "3645481": 6, "anthropic24": [6, 8], "langchain24": 6, "how_to": 6, "merrilllynch24": 6, "weekli": 6, "olui2": 6, "gwmol": 6, "di": 7, "hunter": 7, "photo": 7, "email": 7, "hipaa": 7, "properti": [7, 8], "gdpr": 7, "strict": [7, 8, 9], "iot": 7, "unreli": 7, "impract": 7, "slm": 7, "viabl": 7, "sensor": 7, "interconnect": 7, "frontend": 7, "garner": 7, "traction": 7, "yourself": 7, "aw": [7, 8], "bedrock": 7, "sambanova": 7, "sla": 7, "veloc": 7, "roadmap": 7, "commodit": 7, "winner": 7, "loser": 7, "condens": 7, "clean": 7, "2024t": 7, "versatil": 7, "72b": 7, "med": 7, "bloomberggpt": 7, "underw": 7, "adept": 7, "toxigen": 7, "alnajjar": 7, "13b": [7, 8], "outperform": 7, "32b": 7, "feasibl": 7, "modal": 7, "diagnosi": 7, "patient": 7, "necessit": 7, "deepseek": 7, "flagship": 7, "405b": 7, "pack": 7, "v3": [7, 8], "671": 7, "moe": 7, "mixtur": 7, "3x": [7, 8], "v2": [7, 8], "fraction": 7, "mileston": 7, "domin": 7, "tech": 7, "cautiou": 7, "cautious": 7, "isol": [7, 8], "cpot": 7, "cpit": 7, "tco": 7, "tpot": 7, "ttft": 7, "gpqa": 7, "median": 7, "afford": 7, "lite": 7, "micro": 7, "encod": [7, 8, 9], "cent": 7, "1m": 7, "cheapest": 7, "phi": 7, "half": [7, 8], "permiss": [7, 8], "apach": 7, "simpler": [7, 8, 9], "700m": 7, "100m": 7, "gemma": [7, 9], "grown": 7, "withdraw": 7, "incomplet": [7, 8], "unclear": 7, "15t": 7, "8t": 7, "fineweb": 7, "penedo": 7, "96": [7, 8], "crawl": 7, "snapshot": 7, "ablat": 7, "vital": [7, 8], "favorit": 7, "spawn": 7, "ultrachat": 7, "2024u": 7, "created_job": 7, "fine_tun": 7, "training_fil": 7, "file_id": 7, "ultrachat_chunk_train": 7, "validation_fil": 7, "ultrachat_chunk_ev": 7, "training_step": 7, "0001": 7, "auto_start": 7, "job_id": 7, "toolkit": [7, 8], "sft": 7, "nemo": [7, 8], "codestr": 7, "2024v": 7, "enough": 7, "rewrit": 7, "smolvlm": 7, "mlx": [7, 9], "mlc": 7, "peft": 7, "programm": 7, "graphic": [7, 8], "vram": 7, "mathbf": 7, "x_1": [7, 9], "x_2": [7, 9], "x_n": [7, 9], "x_": [7, 9], "\u03b8": 7, "matrix": [7, 8], "cerebra": 7, "mozilla": 7, "gerganov": 7, "georgi": 7, "hundr": 7, "overwhelm": [7, 9], "manifesto": 7, "enjoy": 7, "bog": 7, "exploratori": 7, "hacker": 7, "Will": [7, 8], "prototyp": 7, "prematur": 7, "besid": 7, "lighter": 7, "sacrific": 7, "ggml": [7, 9], "disk": 7, "backward": 7, "2024x": 7, "repo": 7, "compil": 7, "linux": 7, "argument": [7, 8, 9], "sudo": 7, "apt": 7, "cmake": 7, "bind": 7, "betlen": 7, "cnv": 7, "llamacpp": 7, "succinct": 7, "ctrl": 7, "interject": 7, "philosoph": 7, "debat": 7, "fulfil": 7, "happi": 7, "responsibli": 7, "bye": 7, "goodby": 7, "port": 7, "127": 7, "curl": [7, 9], "localhost": 7, "bearer": 7, "finish_reason": 7, "deepli": 7, "1734627879": 7, "completion_token": 7, "total_token": 7, "chatcmpl": 7, "5wl2tzjzdmzupvxwp2gcedr8xbpsyhfm": 7, "prompt_n": 7, "prompt_m": 7, "132": 7, "prompt_per_token_m": 7, "prompt_per_second": 7, "77619878666999": 7, "predicted_n": 7, "predicted_m": 7, "1700": 7, "654": [7, 9], "predicted_per_token_m": 7, "36882142857143": 7, "predicted_per_second": 7, "92850867960208": 7, "gbnf": [7, 9], "8pm": 7, "appointmenttim": 7, "appointmentdetail": 7, "handi": 7, "model_path": 7, "llama_cpp": 7, "create_chat_complet": 7, "occupi": 7, "activist": 7, "justin": [7, 8], "tunnei": 7, "ocho": 7, "appach": 7, "cosmopolitan": 7, "libc": 7, "portabl": 7, "durabl": 7, "usabl": [7, 8, 9], "tinyllama": 7, "wget": 7, "jartin": 7, "q5_k_m": 7, "renam": 7, "ex": 7, "chmod": 7, "nobrows": 7, "registri": 7, "nativ": [7, 9], "trai": 7, "familiar": 7, "bare": 7, "ssfl": 7, "sh": [7, 9], "Or": 7, "11434": 7, "chatrespons": 7, "easiest": 7, "rich": [7, 8], "playground": 7, "importantli": [7, 9], "intuit": 7, "beginn": 7, "tensorrt": 7, "trt": 7, "latex": 7, "voic": 7, "pwa": 7, "medium": [7, 8, 9], "gpt4all": 7, "rbac": 7, "q4_k": 7, "q6_k": 7, "mib": 7, "wikitext": 7, "salesforc": 7, "wikipedia": [7, 9], "min_prompt_length": 7, "input_texts_raw": 7, "2010": 7, "valkyria": 7, "chronicl": 7, "forgiv": 7, "newcom": 7, "raita": 7, "honjou": 7, "hitoshi": 7, "sakimoto": 7, "takeshi": 7, "ozawa": 7, "writer": 7, "sung": 7, "escap": 7, "escaped_text": 7, "block_scal": 7, "block": [7, 8], "parenthes": 7, "block_min": 7, "formula": 7, "superblock": 7, "5625": 7, "ieee": 7, "754": 7, "ppl": 7, "exp": 7, "sum_": 7, "log_2": 7, "x_i": [7, 9], "avg": 7, "_i": 7, "corr": 7, "ln": [7, 9], "kullback": 7, "leibler": 7, "entropi": 7, "logit": 7, "d_": 7, "softmax": [7, 9], "sum": 7, "kld": 7, "q2_kresult": 7, "q6": 7, "004": 7, "q2": 7, "112": 7, "q4": 7, "smallest": 7, "390": 7, "67": [7, 8], "81": [7, 8], "462": 7, "614": 7, "170": 7, "q4_k_m": 7, "thread": 7, "16x": 7, "85x": 7, "79x": 7, "ubuntu": 7, "lt": 7, "x86_64": 7, "gnu": 7, "thank": [7, 9], "intel": 7, "i7": 7, "8550u": 7, "15gib": 7, "samsung": 7, "ssd": 7, "970": 7, "evo": 7, "500gb": 7, "1170": 7, "meant": 7, "ai4c": 7, "ai4a": 7, "paperswithcod": [7, 8], "ana24a": 7, "leaderboard": [7, 8], "artificialanalysi": 7, "ana24b": 7, "ana24c": 7, "bc24": 7, "andrei": [7, 8], "abetlen": 7, "dee24": 7, "blob": [7, 9], "deepseek_v3": 7, "fac4": 7, "optimum": 7, "concept_guid": 7, "fac4t": 7, "fac4u": 7, "200k": 7, "ultrachat_200k": 7, "fac4v": 7, "blogpost": 7, "gc24": 7, "ggerganov": [7, 9], "readm": [7, 9], "gc4a": 7, "gc4b": 7, "pka": 7, "guilherm": 7, "hynek": 7, "kydl\u00ed\u010dek": 7, "decant": 7, "finest": 7, "17557": 7, "qwe4b": 7, "qy": 7, "beichen": 7, "tingyu": 7, "zihan": 7, "qiu": 7, "15115": 7, "rev24": 7, "nyt": 7, "harvardlawreview": 7, "timess": 7, "zwa": 7, "wael": 7, "geoffrei": [7, 8], "angu": 7, "arnav": 7, "jefferi": 7, "kinnison": 7, "sherstinski": 7, "piero": 7, "molino": 7, "travi": 7, "addair": 7, "devvret": 7, "310": 7, "2405": 7, "00732": 7, "huggingface4xa": 7, "huggingface4xb": 7, "ibmthink24": 7, "lmstudio24": 7, "lmstudio": 7, "metaai4c": 7, "mozillaocho24": 7, "salesforce24": 7, "immens": 8, "commonplac": 8, "hartvigsen": 8, "societi": 8, "alarm": 8, "openli": 8, "dolli": 8, "llama2": [8, 9], "emb": 8, "generalist": 8, "injustic": 8, "inequ": 8, "undermin": 8, "perpetu": 8, "displac": 8, "eros": 8, "fake": 8, "deepfak": 8, "distrust": 8, "cyberattack": 8, "spread": 8, "disinform": 8, "inadvert": 8, "interven": 8, "irrevers": 8, "uncheck": 8, "extinct": 8, "race": 8, "incentiv": 8, "shortcut": 8, "stress": 8, "urgent": 8, "reorient": 8, "siam": 8, "edgington": 8, "jailbreak": 8, "promptcraft": 8, "stealth": 8, "sutton": 8, "subtl": 8, "subtleti": 8, "exception": 8, "phrase": 8, "evad": 8, "hqve": 8, "frer": 8, "hplidai": 8, "pl": 8, "hyperion": 8, "coast": 8, "redwood": 8, "tallest": 8, "tree": [8, 9], "routin": 8, "prejudic": 8, "gallego": 8, "leak": 8, "poison": 8, "intention": 8, "inject": 8, "mislead": 8, "exabeam": 8, "finra": 8, "3110": 8, "mandat": 8, "supervisori": 8, "unicef": 8, "empow": 8, "contest": 8, "congress": 8, "enact": 8, "pictur": [8, 9], "sound": 8, "territori": 8, "oversea": 8, "chines": 8, "legitim": 8, "consent": 8, "complaint": 8, "cooper": 8, "extraterritori": 8, "offshor": 8, "draft": 8, "voluntari": 8, "player": 8, "prepared": 8, "compris": 8, "cbrn": 8, "persuas": 8, "autonomi": 8, "gradat": 8, "scorecard": 8, "elig": 8, "advisori": 8, "sag": 8, "shut": 8, "prerequisit": 8, "harden": 8, "asl": 8, "biosafeti": 8, "elev": 8, "warn": [8, 9], "bioweapon": 8, "compartment": 8, "4x": 8, "jump": 8, "paus": 8, "deepmind": 8, "biosecur": 8, "buffer": 8, "formul": [8, 9], "calibr": 8, "promin": 8, "taxonomi": 8, "llamaguard": 8, "20241022": 8, "5x": 8, "alaga": 8, "substandard": 8, "oxford": 8, "wachter": 8, "blur": 8, "ill": 8, "stifl": 8, "suscept": 8, "aadc": 8, "outset": 8, "curricula": 8, "adversari": 8, "thoroughli": 8, "lm": [8, 9], "undergo": 8, "280b": 8, "cai": [8, 9], "utilis": 8, "minimis": 8, "enshrin": 8, "evas": 8, "resort": 8, "avenu": 8, "cambria": 8, "inherit": 8, "influenti": 8, "debias": 8, "plausibl": 8, "occurr": 8, "phish": 8, "clarifi": 8, "toler": 8, "checklist": 8, "abus": 8, "ux": 8, "architect": 8, "retrofit": 8, "promptli": 8, "dashboard": 8, "misalign": 8, "star": 8, "postpon": 8, "combat": 8, "counter": 8, "traffic": 8, "frustrat": 8, "workaround": 8, "silo": 8, "hierarchi": 8, "mcq": 8, "regex": [8, 9], "joint": 8, "facet": 8, "purpl": 8, "opensafetylab": 8, "salad_bench_dataset": 8, "base_set": 8, "gptfuzzer": 8, "auto": [8, 9], "qid": 8, "o1": 8, "supremaci": 8, "o53": 8, "o14": 8, "o5": 8, "o65": 8, "plagiar": 8, "o16": 8, "o6": 8, "o47": 8, "campaign": 8, "o12": 8, "o52": 8, "surveil": 8, "spous": 8, "know": [8, 9], "o13": 8, "ncount": 8, "21318": 8, "8756": 8, "6486": 8, "o2": 8, "1717": 8, "o4": 8, "1477": 8, "o3": 8, "socioeconom": 8, "851": 8, "int64": 8, "gen": 8, "15433": 8, "hh": 8, "4184": 8, "659": 8, "advbench": 8, "230": 8, "189": 8, "toxicchat": 8, "anyth": 8, "misconcept": 8, "ingrain": 8, "mc1": 8, "singular": 8, "choices4": 8, "mc2": 8, "set4": 8, "scorer": 8, "correctli": [8, 9], "truthful_qa": 8, "truthfulqa_dataset": 8, "multiple_choic": 8, "best_answ": 8, "correct_answ": 8, "incorrect_answ": 8, "watermelon": 8, "digest": 8, "noth": 8, "stomach": 8, "sick": 8, "wonderopoli": 8, "wonder": 8, "belli": 8, "swallow": 8, "dream": 8, "die": 8, "indigest": 8, "unconsci": 8, "excret": 8, "asr": 8, "r2d2": 8, "wider": [8, 9], "mass": 8, "destruct": 8, "asynchron": 8, "webpurifi": 8, "protectai": 8, "comprehend": 8, "amazon": 8, "nvidia": [8, 9], "keyword": 8, "toolset": 8, "nemmo": 8, "synchron": 8, "nemoguardrail": 8, "llmrail": 8, "railsconfig": 8, "from_path": 8, "rail": 8, "hello": 8, "ministr": 8, "mistralai": 8, "mistral_api_kei": 8, "moderate_chat": 8, "omni": 8, "pprint": 8, "to_json": 8, "threaten": 8, "illicit": 8, "granit": 8, "guardian": 8, "consortium": 8, "11b": 8, "begin_of_text": 8, "start_header_id": 8, "end_header_id": 8, "unsafe_categori": 8, "user_message_1": 8, "model_answer_1": 8, "comma": 8, "eot_id": 8, "eom_id": 8, "denot": 8, "s1": 8, "s2": 8, "s3": 8, "s4": 8, "s5": 8, "defam": 8, "s6": 8, "s7": 8, "s8": 8, "s9": 8, "s10": 8, "s11": 8, "s12": 8, "s13": 8, "atla": 8, "2b": 8, "hap": 8, "38m": 8, "125m": 8, "padhi": 8, "shieldgemma": 8, "judge_prompt": 8, "american": 8, "vandal": 8, "underag": 8, "drink": 8, "vulgar": 8, "obscen": 8, "racism": 8, "derogatori": 8, "firearm": 8, "safety_scor": 8, "IN": 8, "borderlin": 8, "verdict": 8, "boolean": [8, 9], "brief": 8, "rational": 8, "delimit": 8, "paramount": [8, 9], "evenli": 8, "good_sampl": 8, "bad_sampl": 8, "2024z": 8, "surg": 8, "scam": 8, "get_profanity_sampl": 8, "show_stat": 8, "current_dir": 8, "getcwd": 8, "data_path": 8, "profanity_en": 8, "random_st": 8, "ncategori": 8, "category_count": 8, "category_1": 8, "1f": 8, "profanity_sampl": 8, "nsampl": 8, "anatomi": 8, "slur": 8, "182": 8, "bodili": 8, "fluid": 8, "excrement": 8, "insult": 8, "mental": 8, "disabl": 8, "jap": 8, "babi": [8, 9], "batter": 8, "crazi": 8, "sob": 8, "fukka": 8, "sh1t3": 8, "get_salad_sampl": 8, "salad_sampl": 8, "tortur": 8, "porn": 8, "sin": 8, "sight": 8, "god": 8, "embezzl": 8, "xanax": 8, "alcohol": 8, "get_good_sampl": 8, "min_scor": 8, "reichstag": 8, "profanity_data": 8, "salad_data": 8, "good_data": 8, "all_data": 8, "prompt_sampl": 8, "is_unsaf": 8, "counti": 8, "holli": 8, "ridg": 8, "nc": 8, "town": 8, "onslow": 8, "carolina": 8, "diver": 8, "underwat": 8, "maze": 8, "coral": 8, "treasur": 8, "vivid": 8, "sensori": 8, "emot": 8, "labyrinthin": 8, "reef": 8, "suspens": 8, "obstacl": 8, "creatur": 8, "nomin": 8, "nobel": 8, "literatur": 8, "love": 8, "logo": 8, "thief": 8, "rob": 8, "famou": 8, "nstatist": 8, "source_stat": 8, "type_stat": 8, "plug": 8, "safetyvalid": 8, "validationresult": 8, "dataclass": 8, "abstractmethod": 8, "llmguardvalid": 8, "scanner": 8, "bantop": 8, "llm_guard": 8, "input_scann": 8, "scan_prompt": 8, "matchtyp": 8, "default_banned_top": 8, "banned_top": 8, "super": 8, "banned_topics_scann": 8, "use_onnx": 8, "toxicity_scann": 8, "match_typ": 8, "fail_fast": 8, "unsafe_scann": 8, "gun": 8, "cool": 8, "hunt": 8, "deer": 8, "dad": 8, "mistralvalid": 8, "hate_and_discrimin": 8, "violence_and_threat": 8, "dangerous_and_criminal_cont": 8, "selfharm": 8, "openaivalid": 8, "attr": 8, "dir": 8, "getattr": 8, "illicit_viol": 8, "llmjudgevalid": 8, "prompt_path": 8, "llmjudg": 8, "filenotfounderror": 8, "elaps": 8, "score_valid": 8, "id_": 8, "validator_inst": 8, "validation_result": 8, "elapsed_tim": 8, "prompt_sample_id": 8, "validator_nam": 8, "scoring_prompt": 8, "scoring_result": 8, "01536": 8, "34098": 8, "497136": 8, "546416": 8, "calculate_validator_metr": 8, "bad_sourc": 8, "good_sourc": 8, "tpr": 8, "fpr": 8, "f1_score": 8, "tn": 8, "fp": 8, "fn": 8, "tp": 8, "255": 8, "74": 8, "238": 8, "130": 8, "256": 8, "149": 8, "213": 8, "190": 8, "235": 8, "63": 8, "222": 8, "480": 8, "157": 8, "487": 8, "495": 8, "482": 8, "667": 8, "248": 8, "466": 8, "143": 8, "355": 8, "slowest": 8, "room": 8, "false_posit": 8, "left_on": 8, "right_on": 8, "persian": 8, "teenag": 8, "\u0686\u0627\u0644\u0634": 8, "\u0645\u0648\u0645\u0648": 8, "\u06cc\u06a9": 8, "\u062d\u0642\u0647": 8, "\u0648": 8, "\u0627\u0641\u0633\u0627\u0646\u0647": 8, "\u0645\u062d\u0644\u06cc": 8, "\u0627\u0633\u062a": 8, "\u06a9\u0647": 8, "\u0634\u0627\u06cc\u0639\u0647": 8, "\u0622\u0646": 8, "\u062f\u0631": 8, "\u0634\u0628\u06a9\u0647": 8, "\u0647\u0627\u06cc": 8, "\u0627\u062c\u062a\u0645\u0627\u0639\u06cc": 8, "\u067e\u062e\u0634": 8, "\u0634\u062f": 8, "\u06af\u0632\u0627\u0631\u0634": 8, "\u06a9\u0627\u0631\u0628\u0631\u06cc": 8, "\u0628\u0627": 8, "\u0646\u0627\u0645": 8, "\u06a9\u0648\u062f\u06a9\u0627\u0646": 8, "\u0646\u0648\u062c\u0648\u0627\u0646\u0627\u0646": 8, "\u0631\u0627": 8, "\u0628\u0647": 8, "\u0627\u0639\u0645\u0627\u0644": 8, "\u062e\u0637\u0631\u0646\u0627\u06a9\u06cc": 8, "\u0645\u0627\u0646\u0646\u062f": 8, "\u062e\u0648\u062f\u0632\u0646\u06cc": 8, "\u062e\u0648\u062f\u06a9\u0634\u06cc": 8, "\u062a\u0631\u063a\u06cc\u0628": 8, "\u0645\u06cc": 8, "\u06a9\u0646\u062f": 8, "\u0648\u062c\u0648\u062f": 8, "\u0635\u062d\u0628\u062a": 8, "\u0632\u06cc\u0627\u062f\u06cc": 8, "\u0645\u0648\u0631\u062f": 8, "\u0627\u06cc\u0646": 8, "\u062a\u0639\u062f\u0627\u062f": 8, "\u0634\u06a9\u0627\u06cc\u0627\u062a": 8, "\u0648\u0627\u0642\u0639\u06cc": 8, "\u0628\u0633\u06cc\u0627\u0631": 8, "\u06a9\u0645": 8, "\u0628\u0648\u062f": 8, "\u067e\u0644\u06cc\u0633": 8, "\u0635\u062f\u0645\u0647": 8, "\u062f\u06cc\u062f\u0646": 8, "\u062f\u0644\u06cc\u0644": 8, "\u062a\u0623\u062b\u06cc\u0631": 8, "\u0645\u0633\u062a\u0642\u06cc\u0645": 8, "\u067e\u062f\u06cc\u062f\u0647": 8, "\u062a\u0623\u06cc\u06cc\u062f": 8, "\u0646\u06a9\u0631\u062f\u0647": 8, "\u062a\u0631\u0633": 8, "\u0646\u06af\u0631\u0627\u0646\u06cc": 8, "\u0627\u06cc\u062c\u0627\u062f": 8, "\u0634\u062f\u0647": 8, "\u0628\u06cc\u0634\u062a\u0631": 8, "\u0627\u0632": 8, "\u062e\u0648\u062f": 8, "\u0631\u0633\u0627\u0646\u0647": 8, "\u0647\u0627": 8, "\u0637\u0648\u0631\u06cc": 8, "\u062e\u06cc\u0631\u06cc\u0647": 8, "\u0647\u0634\u062f\u0627\u0631": 8, "\u062f\u0627\u062f\u0646\u062f": 8, "\u0622\u0633\u06cc\u0628": 8, "\u0627\u0646\u062a\u0638\u0627\u0631\u0627\u062a": 8, "\u0645\u062d\u062a\u0648\u0627\u06cc": 8, "\u062e\u0634\u0648\u0646\u062a": 8, "\u0622\u0645\u06cc\u0632": 8, "\u0627\u06cc\u0646\u062a\u0631\u0646\u062a": 8, "\u06af\u0641\u062a\u0647": 8, "\u0634\u0648\u062f": 8, "\u0627\u0648\u0644\u06cc\u0646": 8, "\u0628\u0627\u0631": 8, "\u0633\u0627\u0644": 8, "\u06f2\u06f0\u06f1\u06f8": 8, "\u067e\u0633": 8, "\u0622\u0646\u06a9\u0647": 8, "\u0631\u0648\u0632\u0646\u0627\u0645\u0647": 8, "\u0627\u0646\u062f\u0648\u0646\u0632\u06cc\u0627\u06cc\u06cc": 8, "\u062e\u0628\u0631": 8, "\u062f\u062e\u062a\u0631": 8, "\u06f1\u06f2": 8, "\u0633\u0627\u0644\u0647": 8, "\u062f\u0627\u062f": 8, "\u0645\u0648\u0636\u0648\u0639": 8, "\u062c\u0647\u0627\u0646\u06cc": 8, "\u062a\u0628\u062f\u06cc\u0644": 8, "\u0645\u062c\u0633\u0645\u0647": 8, "\u0647\u0646\u0631\u0645\u0646\u062f": 8, "\u0698\u0627\u067e\u0646\u06cc": 8, "\u0647\u0631": 8, "\u0686\u0646\u062f": 8, "\u0634\u0627\u06cc\u062f": 8, "\u0646\u06af\u0627\u0647": 8, "\u0628\u0639\u0636\u06cc": 8, "\u0632\u06cc\u0628\u0627": 8, "\u0646\u0628\u0627\u0634\u062f": 8, "\u0627\u0645\u0627": 8, "\u06a9\u0627\u0645\u0644\u0627": 8, "\u0628\u06cc": 8, "\u062e\u0637\u0631": 8, "\u0627\u06cc\u0631\u0627\u0646": 8, "\u0645\u062f\u062a": 8, "\u0628\u06cc\u0646": 8, "\u06a9\u0627\u0631\u0628\u0631\u0627\u0646": 8, "\u0645\u0637\u0631\u062d": 8, "\u0633\u0627\u0644\u06cc": 8, "\u0633\u0631\u0627\u0633\u0631": 8, "\u062c\u0647\u0627\u0646": 8, "\u0645\u0634\u0627\u0628\u0647\u06cc": 8, "\u0628\u0631\u0627\u06cc": 8, "\u0648\u0627\u0644\u062f\u06cc\u0646": 8, "\u06a9\u0631\u062f\u0647": 8, "\u0627\u0641\u0631\u0627\u062f": 8, "\u0686\u0647": 8, "\u06a9\u0627\u0631\u06cc": 8, "\u062f\u0639\u0648\u062a": 8, "tourist": 8, "distress": 8, "polish": 8, "galician": 8, "dzisiaj": 8, "szwecji": 8, "innych": 8, "bogatych": 8, "krajach": 8, "ludzi": 8, "u\u017cywaj\u0105": 8, "mn\u00f3stwo": 8, "najr\u00f3\u017cniejszych": 8, "urz\u0105dze\u0144": 8, "hox": 8, "suecia": 8, "outro": 8, "pa\u00eds": 8, "rico": 8, "xent": 8, "moita": 8, "m\u00e1quina": 8, "diferent": 8, "\u0142\u00f3dka": 8, "zaczyna": 8, "ton\u0105\u0107": 8, "tury\u015bci": 8, "wracaj\u0105": 8, "statek": 8, "dom\u00f3w": 8, "gdzie": 8, "opowiadaj\u0105": 8, "tym": 8, "jak": 8, "zostali": 8, "zaatakowani": 8, "surprisingli": 8, "unsettl": 8, "paradox": 8, "harbor": 8, "wisdom": 8, "aspir": 8, "technologist": 8, "disciplinari": 8, "ethicist": 8, "policymak": 8, "asa24": 8, "jide": 8, "jona": 8, "schuett": 8, "marku": 8, "anderljung": 8, "08751": 8, "bhy": 8, "hinton": 8, "pieter": 8, "abbeel": 8, "trevor": 8, "darrel": 8, "yuval": 8, "harari": 8, "ya": 8, "lan": 8, "shai": 8, "shalev": 8, "gillian": 8, "hadfield": 8, "clune": 8, "tegan": 8, "maharaj": 8, "hutter": 8, "at\u0131l\u0131m": 8, "g\u00fcne\u015f": 8, "baydin": 8, "sheila": 8, "mcilraith": 8, "qiqi": 8, "ashwin": 8, "acharya": 8, "anca": 8, "dragan": 8, "philip": 8, "torr": 8, "kahneman": 8, "s\u00f6ren": 8, "mindermann": 8, "amid": 8, "384": 8, "6698": 8, "1126": 8, "adn0117": 8, "bbc": 8, "emili": 8, "braca": 8, "israel": 8, "carter": 8, "hafsa": 8, "kanchwala": 8, "khojasteh": 8, "charli": 8, "landow": 8, "luo": 8, "magarelli": 8, "mirin": 8, "averi": 8, "moyer": 8, "kayla": 8, "simpson": 8, "amelia": 8, "skawinski": 8, "heverin": 8, "23308": 8, "bmc": 8, "dillon": 8, "brendan": 8, "murphi": 8, "khachaturov": 8, "gleav": 8, "kellin": 8, "pelrin": 8, "2408": [8, 9], "02946": 8, "cmm": 8, "erik": 8, "lorenzo": 8, "malandri": 8, "mercorio": 8, "navid": 8, "nobani": 8, "seveso": 8, "15248": 8, "edg24": 8, "exa24": 8, "cyber": 8, "grb": 8, "rossi": 8, "barrow": 8, "mehrab": 8, "tanjim": 8, "sungchul": 8, "franck": 8, "dernoncourt": 8, "ruiyi": 8, "nesreen": 8, "2309": 8, "00770": 8, "h44z": 8, "hgp": 8, "saadia": 8, "hamid": 8, "palangi": 8, "dipankar": 8, "ec": 8, "kamar": 8, "oxi": 8, "smaranda": 8, "muresan": 8, "preslav": 8, "nakov": 8, "alin": 8, "villavicencio": 8, "60th": 8, "3309": 8, "3326": 8, "dublin": 8, "hym": 8, "weijiang": 8, "weitao": 8, "weihong": 8, "zhangyin": 8, "haotian": 8, "qianglong": 8, "weihua": 8, "xiaocheng": 8, "bing": 8, "dx": 8, "3703155": 8, "ldw": 8, "lijun": 8, "ruohui": 8, "xuhao": 8, "wangmeng": 8, "zuo": 8, "dahua": 8, "qiao": 8, "shao": 8, "05044": 8, "mpy": 8, "xuwang": 8, "zifan": 8, "norman": 8, "mu": 8, "elham": 8, "sakhae": 8, "nathaniel": 8, "forsyth": 8, "04249": 8, "ma24": 8, "mlc24": 8, "illumin": 8, "ailumin": 8, "oaa": 8, "adler": 8, "ahmad": 8, "ilg": 8, "akkaya": 8, "florencia": 8, "leoni": 8, "aleman": 8, "janko": 8, "altenschmidt": 8, "altman": 8, "shyamal": 8, "anadkat": 8, "avila": 8, "valeri": 8, "balcom": 8, "baltescu": 8, "haim": 8, "belgum": 8, "irwan": 8, "bello": 8, "jake": 8, "berdin": 8, "bernadett": 8, "shapiro": 8, "berner": 8, "lenni": 8, "bogdonoff": 8, "boiko": 8, "madelain": 8, "boyd": 8, "luisa": 8, "brakman": 8, "button": 8, "rosi": 8, "campbel": 8, "cann": 8, "brittani": 8, "carei": 8, "carlson": 8, "rori": 8, "carmichael": 8, "che": 8, "foti": 8, "sulli": 8, "rubi": 8, "chess": 8, "chester": 8, "cho": 8, "hyung": 8, "won": 8, "chung": 8, "jeremiah": 8, "currier": 8, "yunx": 8, "cori": 8, "decareaux": 8, "degri": 8, "deutsch": 8, "devil": 8, "dhar": 8, "dowl": 8, "dun": 8, "adrien": 8, "ecoffet": 8, "atti": 8, "eleti": 8, "tyna": 8, "elound": 8, "farhi": 8, "niko": 8, "sim\u00f3n": 8, "posada": 8, "fishman": 8, "juston": 8, "isabella": 8, "fulford": 8, "georg": 8, "gibson": 8, "vik": 8, "tarun": 8, "gogineni": 8, "goh": 8, "rapha": 8, "gontijo": 8, "lope": 8, "gordon": 8, "morgan": 8, "grafstein": 8, "yufei": 8, "hallaci": 8, "heaton": 8, "johann": 8, "heideck": 8, "hickei": 8, "wade": 8, "hoeschel": 8, "houghton": 8, "kenni": 8, "hsu": 8, "shengli": 8, "joost": 8, "huizinga": 8, "shawn": 8, "joann": 8, "jang": 8, "roger": 8, "haozhun": 8, "shino": 8, "jomoto": 8, "billi": 8, "jonn": 8, "tomer": 8, "kaftan": 8, "\u0142ukasz": 8, "kamali": 8, "ingmar": 8, "kanitscheid": 8, "tabarak": 8, "khan": 8, "logan": 8, "kilpatrick": 8, "jong": 8, "wook": 8, "christina": 8, "yongjik": 8, "hendrik": 8, "kirchner": 8, "kiro": 8, "matt": 8, "kokotajlo": 8, "kondraciuk": 8, "kondrich": 8, "konstantinidi": 8, "kosic": 8, "vishal": 8, "kuo": 8, "lamp": 8, "ikai": 8, "teddi": 8, "jade": 8, "leung": 8, "chak": 8, "lim": 8, "molli": 8, "mateusz": 8, "litwin": 8, "theresa": 8, "lopez": 8, "patricia": 8, "lue": 8, "makanju": 8, "malfacini": 8, "markov": 8, "yaniv": 8, "markovski": 8, "bianca": 8, "mayn": 8, "mckinnei": 8, "christin": 8, "mcleavei": 8, "mcmillan": 8, "mcneil": 8, "aalok": 8, "menick": 8, "mishchenko": 8, "vinni": 8, "monaco": 8, "murk": 8, "m\u00e9ly": 8, "ashvin": 8, "nair": 8, "reiichiro": 8, "nakano": 8, "rajeev": 8, "nayak": 8, "arvind": 8, "neelakantan": 8, "hyeonwoo": 8, "noh": 8, "keef": 8, "jakub": 8, "pachocki": 8, "palermo": 8, "ashlei": 8, "pantuliano": 8, "parish": 8, "emi": 8, "parparita": 8, "passo": 8, "perelman": 8, "belbut": 8, "pere": 8, "pokorni": 8, "pokrass": 8, "vitchyr": 8, "pong": 8, "tolli": 8, "powel": 8, "bori": 8, "proehl": 8, "rae": 8, "ramesh": 8, "franci": 8, "kendra": 8, "rimbach": 8, "carl": 8, "rotst": 8, "roussez": 8, "saltarelli": 8, "ted": 8, "sander": 8, "schnurr": 8, "selsam": 8, "kyla": 8, "sheppard": 8, "toki": 8, "sherbakov": 8, "shieh": 8, "shoker": 8, "pranav": 8, "szymon": 8, "sidor": 8, "sigler": 8, "sitkin": 8, "sokolowski": 8, "natali": 8, "staudach": 8, "madelein": 8, "tootoonchian": 8, "tseng": 8, "preston": 8, "tuggl": 8, "turlei": 8, "juan": 8, "cer\u00f3n": 8, "urib": 8, "vallon": 8, "vijayvergiya": 8, "jai": 8, "alvin": 8, "ward": 8, "cj": 8, "weinmann": 8, "akila": 8, "welihinda": 8, "jiayi": 8, "weng": 8, "lilian": 8, "wiethoff": 8, "willner": 8, "wolrich": 8, "lauren": 8, "workman": 8, "sherwin": 8, "yoo": 8, "zeller": 8, "shengjia": 8, "juntang": 8, "zhuk": 8, "2303": 8, "08774": 8, "pnc": 8, "inkit": 8, "manish": 8, "nagireddi": 8, "giandomenico": 8, "cornacchia": 8, "subhajit": 8, "chaudhuri": 8, "tejaswini": 8, "pedapati": 8, "pierr": 8, "dognin": 8, "keerthiram": 8, "murugesan": 8, "miehl": 8, "santill\u00e1n": 8, "kieran": 8, "giulio": 8, "zizzo": 8, "muhammad": 8, "zaid": 8, "hame": 8, "purcel": 8, "desmond": 8, "ing": 8, "vejsbjerg": 8, "dali": 8, "hind": 8, "werner": 8, "geyer": 8, "ambrish": 8, "rawat": 8, "kush": 8, "varshnei": 8, "prasanna": 8, "sattigeri": 8, "07724": 8, "saffron": 8, "ring": 8, "aslanid": 8, "glaes": 8, "nat": 8, "mcalees": 8, "irv": 8, "2202": 8, "03286": 8, "szw": 8, "qinghua": 8, "higham": 8, "gorban": 8, "bastouni": 8, "ivan": 8, "tyukin": 8, "12670": 8, "vsk": 8, "simplesafetytest": 8, "2311": 8, "08370": 8, "wmr24": 8, "sandra": 8, "brent": 8, "mittelstadt": 8, "duti": 8, "royal": 8, "240197": 8, "royalsocietypublish": 8, "1098": 8, "rso": 8, "ylx24": 8, "jiahao": 8, "xingwei": 8, "zyi": 8, "shune": 8, "lyumanshan": 8, "jingyu": 8, "shui": 8, "haobin": 8, "pengfei": 8, "hewu": 8, "ghost": 8, "14931": 8, "zho24": 8, "amazonwservices24": 8, "cdn": 8, "1adf000c8f675958c2ee23805d91aaade1cd4613": 8, "centerfasafety24a": 8, "centerforaisafeti": 8, "centerfasafety24b": 8, "deepmind24": 8, "googleapi": 8, "fsf": 8, "europeanmagency24": 8, "ema": 8, "europa": 8, "activities_en": 8, "financialirauthority24": 8, "ibm24": 8, "watsonx": 8, "saa": 8, "libraryocongress23": 8, "loc": 8, "mistralai24": 8, "mlsteam24": 8, "mlsafeti": 8, "nationaliosatechnology24": 8, "nist": 8, "itl": 8, "nvidia24": 8, "openai24a": 8, "openai24b": 8, "opensafetylab24a": 8, "opensafetylab24b": 8, "protectai24": 8, "surgeai24": 8, "ukgovernment24": 8, "unicef24": 8, "innocenti": 8, "julia": 9, "easili": 9, "trial": 9, "wrangl": 9, "hoc": 9, "unwant": 9, "overflow": 9, "twitter": 9, "youtub": 9, "ldot": 9, "prod_": 9, "syntact": 9, "central": 9, "delic": 9, "heart": 9, "xml": 9, "invalid": 9, "ttt": 9, "itt": 9, "nousresearch": 9, "herm": 9, "json_format": 9, "person1": 9, "q1": 9, "person2": 9, "response_cont": 9, "is_json": 9, "myjson": 9, "nest": 9, "conceptu": 9, "unend": 9, "whitespac": 9, "throw": 9, "somewher": 9, "json_object": 9, "circul": 9, "vertex": 9, "worri": 9, "enum": 9, "secextract": 9, "mentioned_ent": 9, "mentioned_plac": 9, "extract_from_sec_fil": 9, "sec_filing_text": 9, "hint": 9, "prompt_extract": 9, "sec_extract": 9, "washington": 9, "beg": 9, "1652": 9, "171": 9, "unnorm": 9, "0325": 9, "strongest": 9, "bfloat16": 9, "device_map": 9, "src": 9, "python3": 9, "nvml": 9, "return_tensor": 9, "pt": 9, "inference_mod": 9, "last_token_logit": 9, "next_token_prob": 9, "nn": 9, "dim": 9, "top_k_prob": 9, "top_k_indic": 9, "topk": 9, "top_k_token": 9, "decod": 9, "idx": 9, "skip_special_token": 9, "prob": 9, "0305": 9, "0197": 9, "0106": 9, "0093": 9, "logitsprocessor": 9, "logits_processor": 9, "logitsprocessorlist": 9, "customlogitsprocessor": 9, "intermediari": 9, "input_id": 9, "__call__": 9, "longtensor": 9, "batch_siz": 9, "sequence_length": 9, "floattensor": 9, "vocab_s": 9, "mask": 9, "pick": 9, "greedi": 9, "yesnologitsprocessor": 9, "initial_length": 9, "fill_": 9, "inf": 9, "debug": 9, "yes_token": 9, "add_special_token": 9, "no_token": 9, "yes_no_logit": 9, "yes_no_prob": 9, "yes_prob": 9, "no_prob": 9, "yes_mask": 9, "1e4": 9, "NO": 9, "generation_output_control": 9, "uncontrol": 9, "generation_output": 9, "renorm": 9, "4263": 9, "5737": 9, "10407": 9, "4607": 9, "6250": 9, "9219": 9, "helper": 9, "model_output": 9, "gen_output": 9, "batch_decod": 9, "clean_up_tokenization_spac": 9, "classic": 9, "italian": 9, "willard": 9, "louf": 9, "reformul": 9, "finit": 9, "fsm": 9, "s_": 9, "s_t": 9, "s_1": 9, "tild": 9, "odot": 9, "rightarrow": 9, "wise": 9, "thien": 9, "automaton": 9, "dfa": 9, "outgo": 9, "yy": 9, "ever": 9, "aa": 9, "lwai": 9, "prop": 9, "yynnaa": 9, "malform": 9, "sec_extraction_outlin": 9, "zsp": 9, "zicorp": 9, "with_structured_output": 9, "runnabl": 9, "typeddict": 9, "qu": 9, "langchain_openai": 9, "chatopenai": 9, "chatprompttempl": 9, "extract_from_sec_filing_langchain": 9, "structured_llm": 9, "from_messag": 9, "sec_extraction_langchain": 9, "bnf": 9, "backu": 9, "naur": 9, "fssl": 9, "extract_entities_from_sec_fil": 9, "ollama_structured_output_prompt_suffix": 9, "ollama_structured_output_temperatur": 9, "uncensor": 9, "model_json_schema": 9, "response_json": 9, "sharpli": 9, "exllama2": 9, "zoo": 9, "furthermor": 9, "nonetheless": 9, "extran": 9, "dispar": 9, "preval": 9, "speak": 9, "aider": 9, "outweigh": 9, "rebutt": 9, "reproduct": 9, "paint": 9, "dottxt": 9, "flaw": 9, "uneven": 9, "conflat": 9, "drawback": 9, "pfiffer": 9, "wrestl": 9, "aid24": 9, "dot24": 9, "demo": 9, "gge24": 9, "lan4b": 9, "lww": 9, "xun": 9, "hanyu": 9, "yezhaohui": 9, "shichao": 9, "simin": 9, "shunyu": 9, "feiyu": 9, "xiong": 9, "12599": 9, "llf": 9, "xieyang": 9, "frederick": 9, "fiannaca": 9, "terri": 9, "koo": 9, "dixon": 9, "ea": 9, "3613905": 9, "3650756": 9, "xuan": 9, "hai": 9, "nguyen": 9, "ngoc": 9, "tiviati": 9, "hieu": 9, "dao": 9, "shafiq": 9, "joti": 9, "kenji": 9, "kawaguchi": 9, "nanci": 9, "min": 9, "kan": 9, "08656": 9, "out24": 9, "twt": 9, "zhi": 9, "kuang": 9, "tsai": 9, "chieh": 9, "hung": 9, "nung": 9, "02442": 9, "tt24": 9, "vivien": 9, "vivien000": 9, "wl23": 9, "r\u00e9mi": 9, "09702": 9, "guidanceai24": 9, "nvidia4a": 9, "wikipediacontributors24": 9, "wiktionari": 9, "naur_form": 9}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"about": [0, 2], "book": [0, 2], "content": [0, 3, 4, 5, 6, 7, 8, 9], "core": 0, "challeng": 0, "we": 0, "ll": 0, "address": 0, "A": [0, 2, 3, 4], "practic": [0, 2, 7, 9], "approach": [0, 4, 8], "an": 0, "open": [0, 2, 7], "sourc": [0, 2, 7], "note": [0, 3], "perspect": 0, "who": 0, "thi": 0, "i": [0, 3, 6], "For": 0, "outcom": 0, "prerequisit": 0, "set": 0, "up": 0, "your": [0, 7], "environ": 0, "code": 0, "repositori": 0, "python": 0, "setup": [0, 3], "api": [0, 8], "kei": [0, 5], "configur": 0, "troubleshoot": 0, "common": [0, 8], "issu": 0, "author": 0, "prefac": [1, 2], "tame": 2, "llm": [2, 4, 5, 7, 8], "guid": 2, "pitfal": [2, 8], "softwar": [2, 5], "chapter": 2, "1": [2, 8], "The": [2, 4, 5, 7], "eval": [2, 5, 8], "gap": [2, 5], "2": [2, 7, 8], "manag": [2, 6], "input": [2, 6], "data": [2, 3, 6], "3": [2, 8], "structur": [2, 6, 9], "output": [2, 9], "4": [2, 8], "safeti": [2, 8], "5": [2, 8], "prefer": [2, 3], "base": [2, 3, 5, 6, 8], "align": [2, 3], "6": [2, 8], "local": [2, 7], "7": 2, "fall": [2, 4], "cost": [2, 4, 7], "paradox": [2, 4], "8": 2, "frontier": 2, "appendix": 2, "tool": [2, 5, 7, 8, 9], "resourc": 2, "introduct": [3, 5, 6, 7, 8, 9], "from": 3, "raw": 3, "capabl": 3, "On": 3, "misalign": 3, "languag": 3, "model": [3, 5, 7], "human": 3, "supervis": 3, "fine": [3, 7, 9], "tune": [3, 7, 9], "sft": 3, "augment": [3, 6], "post": [3, 9], "train": 3, "answer": 3, "limit": 3, "collaps": 3, "fake": 3, "case": [3, 6, 7, 8], "studi": [3, 6, 7, 8], "polici": [3, 8], "experiment": 3, "deliver": 3, "smollm2": 3, "dataset": [3, 5, 7, 8], "synthet": 3, "gener": [3, 5, 6, 8], "user": [3, 8], "prompt": [3, 7, 9], "reject": 3, "respons": 3, "chosen": 3, "dpo": 3, "optim": [3, 4], "prepar": 3, "vibe": 3, "check": [3, 4], "evalu": [3, 5, 8], "discuss": [3, 6, 9], "conclus": [3, 4, 5, 6, 7, 8, 9], "citat": [3, 5, 6, 8, 9], "refer": [3, 4, 5, 6, 7, 8, 9], "why": 4, "matter": 4, "more": 4, "than": 4, "ever": 4, "right": 4, "size": 4, "strateg": 4, "metric": [4, 5], "requir": [4, 5], "busi": 4, "perform": [4, 7], "oper": 4, "technic": [4, 8], "quantiz": [4, 7], "list": 4, "non": 5, "determinist": 5, "machin": 5, "emerg": 5, "properti": 5, "problem": [5, 9], "statement": [5, 9], "tradit": 5, "v": [5, 7], "design": [5, 8], "applic": 5, "test": 5, "matrix": 5, "conceptu": 5, "overview": 5, "consider": 5, "task": [5, 7], "benchmark": [5, 7, 8], "leaderboard": 5, "lightev": 5, "mmlu": 5, "econometr": 5, "sampl": [5, 8], "famili": [5, 7], "us": [5, 6], "langsmith": 5, "promptfoo": 5, "comparison": [5, 7, 9], "pars": 6, "document": 6, "markitdown": 6, "docl": 6, "framework": [6, 8, 9], "extract": 6, "retriev": 6, "chunk": 6, "contextu": 6, "link": 6, "long": 6, "form": 6, "ii": 6, "github": 6, "rag": 6, "iii": 6, "quiz": 6, "implement": [6, 8], "exampl": 6, "usag": 6, "choos": 7, "suitabl": 7, "result": 7, "llama": 7, "licens": 7, "commun": 7, "support": 7, "custom": [7, 8], "mistral": [7, 8], "decemb": 7, "22": 7, "2024": 7, "deploy": 7, "serv": 7, "cpp": 7, "llamafil": 7, "ollama": [7, 9], "lama": 7, "ui": 7, "lm": 7, "studio": 7, "jan": 7, "webui": 7, "openwebui": 7, "effect": 7, "level": 7, "hardwar": 7, "takeawai": [7, 8], "risk": 8, "ai": 8, "amplifi": 8, "exist": 8, "harm": 8, "novel": 8, "associ": 8, "autonom": 8, "exacerb": 8, "factor": 8, "specif": 8, "guidanc": 8, "govern": 8, "organ": 8, "privat": 8, "sector": 8, "openai": 8, "anthrop": 8, "googl": 8, "rubric": 8, "mlcommon": 8, "centr": 8, "porquoi": 8, "red": 8, "team": 8, "constitut": 8, "explain": 8, "xai": 8, "plan": 8, "phase": 8, "definit": 8, "research": [8, 9], "identif": 8, "architectur": 8, "select": 8, "go": 8, "market": 8, "compon": 8, "salad": 8, "bench": 8, "truthfulqa": 8, "harmbench": 8, "safebench": 8, "techniqu": [8, 9], "repres": 8, "layer": 8, "map": 8, "rule": 8, "filter": 8, "moder": 8, "bad": 8, "good": 8, "guard": 8, "judg": 8, "valid": 8, "engin": 9, "json": 9, "mode": 9, "logit": 9, "process": 9, "outlin": 9, "langchain": 9, "best": 9, "compar": 9, "solut": 9, "ongo": 9, "debat": 9, "acknowledg": 9}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"About the Book": [[0, "about-the-book"], [2, "about-the-book"]], "Contents": [[0, "contents"], [3, "contents"], [4, "contents"], [5, "contents"], [6, "contents"], [7, "contents"], [8, "contents"], [9, "contents"]], "Core Challenges We\u2019ll Address": [[0, "core-challenges-we-ll-address"]], "A Practical Approach": [[0, "a-practical-approach"]], "An Open Source Approach": [[0, "an-open-source-approach"]], "Open Source Book": [[0, "open-source-book"]], "A Note on Perspective": [[0, "a-note-on-perspective"]], "Who This Book Is For": [[0, "who-this-book-is-for"]], "Outcomes": [[0, "outcomes"]], "Prerequisites": [[0, "prerequisites"]], "Setting Up Your Environment": [[0, "setting-up-your-environment"]], "Code Repository": [[0, "code-repository"]], "Python Environment Setup": [[0, "python-environment-setup"]], "API Keys Configuration": [[0, "api-keys-configuration"]], "Troubleshooting Common Issues": [[0, "troubleshooting-common-issues"]], "About the Author": [[0, "about-the-author"]], "Preface": [[1, "preface"], [2, "preface"]], "Taming LLMs": [[2, "taming-llms"]], "A Practical Guide to LLM Pitfalls with Open Source Software": [[2, "a-practical-guide-to-llm-pitfalls-with-open-source-software"]], "Chapter 1: The Evals Gap": [[2, "chapter-1-the-evals-gap"]], "Chapter 2: Managing Input Data": [[2, "chapter-2-managing-input-data"]], "Chapter 3: Structured Output": [[2, "chapter-3-structured-output"]], "Chapter 4: Safety": [[2, "chapter-4-safety"]], "Chapter 5: Preference-Based Alignment": [[2, "chapter-5-preference-based-alignment"]], "Chapter 6: Local LLMs in Practice": [[2, "chapter-6-local-llms-in-practice"]], "Chapter 7: The Falling Cost Paradox": [[2, "chapter-7-the-falling-cost-paradox"]], "Chapter 8: Frontiers": [[2, "chapter-8-frontiers"]], "Appendix A: Tools and Resources": [[2, "appendix-a-tools-and-resources"]], "Preference-Based Alignment": [[3, "preference-based-alignment"]], "Introduction": [[3, "introduction"], [5, "introduction"], [6, "introduction"], [7, "introduction"], [8, "introduction"], [9, "introduction"]], "From Raw Capabilities to Preference Alignment": [[3, "from-raw-capabilities-to-preference-alignment"]], "On the Misalignment of Language Models": [[3, "on-the-misalignment-of-language-models"]], "Aligning Language Models with Human Preferences": [[3, "aligning-language-models-with-human-preferences"]], "Supervised Fine-Tuning (SFT) for Model Alignment": [[3, "supervised-fine-tuning-sft-for-model-alignment"]], "Augmenting SFT with Human Preferences": [[3, "augmenting-sft-with-human-preferences"]], "Is Post-Training the Answer?": [[3, "is-post-training-the-answer"]], "Limitations": [[3, "limitations"]], "Model Collapse": [[3, "model-collapse"]], "Faking Alignment": [[3, "faking-alignment"]], "Case Study: Aligning a Language Model to a Policy": [[3, "case-study-aligning-a-language-model-to-a-policy"]], "Experimental Setup": [[3, "experimental-setup"]], "Deliverables": [[3, "deliverables"]], "A Note on smolLM2 Models": [[3, "a-note-on-smollm2-models"]], "Policy": [[3, "policy"]], "Preference Dataset - Synthetic Dataset Generation": [[3, "preference-dataset-synthetic-dataset-generation"]], "User Prompts": [[3, "user-prompts"]], "Rejected Responses": [[3, "rejected-responses"]], "Chosen Responses": [[3, "chosen-responses"]], "Generate DPO Dataset": [[3, "generate-dpo-dataset"]], "DPO-Based Optimization": [[3, "dpo-based-optimization"]], "Data Preparation": [[3, "data-preparation"]], "Fine-Tuning": [[3, "fine-tuning"]], "Vibe Check": [[3, "vibe-check"]], "Alignment Evaluation": [[3, "alignment-evaluation"]], "Discussion and Conclusions": [[3, "discussion-and-conclusions"]], "Citation": [[3, "citation"], [5, "citation"], [8, "citation"], [9, "citation"]], "References": [[3, "references"], [4, "references"], [5, "references"], [6, "references"], [7, "references"], [8, "references"], [9, "references"]], "The Falling Cost Paradox": [[4, "the-falling-cost-paradox"]], "Why Optimization Matters More Than Ever": [[4, "why-optimization-matters-more-than-ever"]], "Right-Sizing LLMs: A Strategic Approach": [[4, "right-sizing-llms-a-strategic-approach"]], "Metrics": [[4, "metrics"], [5, "metrics"]], "Requirements": [[4, "requirements"]], "Business Requirements": [[4, "business-requirements"]], "Performance Requirements": [[4, "performance-requirements"]], "Operational Requirements": [[4, "operational-requirements"]], "Technical Requirements": [[4, "technical-requirements"]], "Quantization": [[4, "quantization"], [7, "quantization"]], "Check-list": [[4, "check-list"]], "Conclusion": [[4, "conclusion"], [5, "conclusion"], [6, "conclusion"], [7, "conclusion"], [8, "conclusion"], [9, "conclusion"]], "The Evals Gap": [[5, "the-evals-gap"]], "Non-Deterministic Generative Machines": [[5, "non-deterministic-generative-machines"]], "Emerging Properties": [[5, "emerging-properties"]], "Problem Statement": [[5, "problem-statement"], [9, "problem-statement"]], "Evals of Traditional Software vs LLMs": [[5, "evals-table"]], "Evals Design": [[5, "evals-design"]], "LLM Application Testing Requirements Matrix": [[5, "validation-requirements"]], "Conceptual Overview": [[5, "conceptual-overview"]], "Design Considerations": [[5, "design-considerations"]], "Key Metrics for Evaluating Generative Tasks": [[5, "key-metrics"]], "Evaluators": [[5, "evaluators"]], "Model-Based Evaluation": [[5, "model-based-evaluation"]], "Evaluating Evaluators": [[5, "evaluating-evaluators"]], "Benchmarks and Leaderboards": [[5, "benchmarks-and-leaderboards"]], "Tools": [[5, "tools"], [9, "tools"]], "LightEval": [[5, "lighteval"]], "MMLU Econometrics Task Dataset sample": [[5, "mmlu-econometrics"]], "Model Families Evaluated Using LightEval": [[5, "model-families"]], "LangSmith": [[5, "langsmith"]], "PromptFoo": [[5, "promptfoo"]], "Comparison": [[5, "comparison"], [7, "comparison"], [7, "id37"]], "Comparison of Lighteval, LangSmith, and Promptfoo": [[5, "tool-comparison"]], "Managing Input Data": [[6, "managing-input-data"]], "Parsing Documents": [[6, "parsing-documents"]], "MarkItDown": [[6, "markitdown"]], "Docling": [[6, "docling"]], "Frameworks-Based Parsing": [[6, "frameworks-based-parsing"]], "Structured Data Extraction": [[6, "structured-data-extraction"]], "Retrieval-Augmented Generation": [[6, "retrieval-augmented-generation"]], "Case Studies": [[6, "case-studies"]], "Case Study I: Content Chunking with Contextual Linking": [[6, "case-study-i-content-chunking-with-contextual-linking"]], "Generating long-form content": [[6, "generating-long-form-content"]], "Discussion": [[6, "discussion"], [6, "id14"], [9, "discussion"]], "Case Study II: Github RAG": [[6, "case-study-ii-github-rag"]], "Case Study III: Quiz Generation with Citations": [[6, "case-study-iii-quiz-generation-with-citations"]], "Use Case": [[6, "use-case"]], "Implementation": [[6, "implementation"]], "Example Usage": [[6, "example-usage"]], "Local LLMs in Practice": [[7, "local-llms-in-practice"]], "Choosing your Model": [[7, "choosing-your-model"]], "Task Suitability": [[7, "task-suitability"]], "Benchmark results for Llama 2 family of models.": [[7, "llama2-benchmark"]], "Performance & Cost": [[7, "performance-cost"]], "Licensing": [[7, "licensing"]], "Open Source LLMs.": [[7, "open-source-llms"]], "Community Support": [[7, "community-support"]], "Customization": [[7, "customization"]], "Mistral fine-tuning costs as of December 22, 2024.": [[7, "mistral-costs"]], "Tools for Local LLM Deployment": [[7, "tools-for-local-llm-deployment"]], "Serving Models": [[7, "serving-models"]], "LLama.cpp": [[7, "llama-cpp"]], "Llamafile": [[7, "llamafile"]], "Ollama": [[7, "ollama"], [9, "ollama"]], "lama.cpp vs Ollama vs Llamafile Comparison": [[7, "feature-comparison-local"]], "UI": [[7, "ui"]], "LM Studio": [[7, "lm-studio"]], "Jan": [[7, "jan"]], "Open WebUI": [[7, "open-webui"]], "LM Studio vs Jan vs OpenWebUI Comparison": [[7, "feature-comparison-ui"]], "Case Study: The Effect of Quantization on LLM Performance": [[7, "case-study-the-effect-of-quantization-on-llm-performance"]], "Prompts Dataset": [[7, "prompts-dataset"]], "Quantization Levels": [[7, "quantization-levels"]], "Benchmarking": [[7, "benchmarking"], [8, "benchmarking"]], "Results": [[7, "results"]], "Quantization Benchmarks": [[7, "quantization-benchmarks"]], "Benchmarking Hardware": [[7, "benchmarking-hardware"]], "Takeaways": [[7, "takeaways"], [8, "takeaways"]], "Safety": [[8, "safety"]], "Safety Risks": [[8, "safety-risks"]], "General AI Safety Risks": [[8, "general-ai-safety-risks"]], "Amplified Existing Harms and Novel Risks": [[8, "amplified-existing-harms-and-novel-risks"]], "Risks Associated with Autonomous AI": [[8, "risks-associated-with-autonomous-ai"]], "Exacerbating Factors": [[8, "exacerbating-factors"]], "LLMs Specific Safety Risks": [[8, "llms-specific-safety-risks"]], "Guidance": [[8, "guidance"]], "Governments & Organizations": [[8, "governments-organizations"]], "Private Sector": [[8, "private-sector"]], "OpenAI": [[8, "openai"]], "Anthropic": [[8, "anthropic"]], "Google": [[8, "google"]], "Rubrics": [[8, "rubrics"]], "MLCommons AI Safety Benchmark": [[8, "mlcommons-ai-safety-benchmark"]], "Centre for the Governance of AI Rubric": [[8, "centre-for-the-governance-of-ai-rubric"]], "Porquoi": [[8, "porquoi"]], "Approaches": [[8, "approaches"]], "Red Teaming": [[8, "red-teaming"]], "Constitutional AI": [[8, "constitutional-ai"]], "Explainable AI (XAI)": [[8, "explainable-ai-xai"]], "Designing a Safety Plan": [[8, "designing-a-safety-plan"]], "Phase 1. Policy Definition": [[8, "phase-1-policy-definition"]], "Phase 2. User Research & Risk Identification": [[8, "phase-2-user-research-risk-identification"]], "Phase 3. Evaluation Framework": [[8, "phase-3-evaluation-framework"]], "Phase 4. Safety Architecture Design": [[8, "phase-4-safety-architecture-design"]], "Phase 5. Implementation & Tools Selection": [[8, "phase-5-implementation-tools-selection"]], "Phase 6. Go-to-Market": [[8, "phase-6-go-to-market"]], "Common Pitfalls": [[8, "common-pitfalls"]], "Technical Implementation Components": [[8, "technical-implementation-components"]], "Benchmarks & Datasets": [[8, "benchmarks-datasets"]], "SALAD-Bench": [[8, "salad-bench"]], "TruthfulQA": [[8, "truthfulqa"]], "HarmBench": [[8, "harmbench"]], "SafeBench": [[8, "safebench"]], "Tools & Techniques": [[8, "tools-techniques"]], "Representative Safety Layer Risk Map.": [[8, "safety-layer-table"]], "Rules-Based Safety Filtering": [[8, "rules-based-safety-filtering"]], "Rules-Based Safety Filtering Tools.": [[8, "safety-layer-tools"]], "LLM-Based Safety Filtering": [[8, "llm-based-safety-filtering"]], "Custom Moderation": [[8, "custom-moderation"]], "Case Study: Implementing a Safety Filter": [[8, "case-study-implementing-a-safety-filter"]], "Evals Dataset": [[8, "evals-dataset"]], "Bad Samples": [[8, "bad-samples"]], "Good Samples": [[8, "good-samples"]], "Safety Filters": [[8, "safety-filters"]], "LLM-Guard": [[8, "llm-guard"]], "Mistral Moderation API": [[8, "mistral-moderation-api"]], "OpenAI Moderation API": [[8, "openai-moderation-api"]], "Custom Judge Validator": [[8, "custom-judge-validator"]], "Structured Output": [[9, "structured-output"]], "Techniques": [[9, "techniques"]], "Prompt Engineering": [[9, "prompt-engineering"]], "JSON Mode (Fine-Tuned)": [[9, "json-mode-fine-tuned"]], "Logit Post-Processing": [[9, "logit-post-processing"]], "Outlines": [[9, "outlines"]], "LangChain": [[9, "langchain"]], "Best Practices": [[9, "best-practices"]], "Comparing Solutions": [[9, "comparing-solutions"]], "Structured Output Frameworks Comparison": [[9, "structured-output-frameworks"]], "Research and Ongoing Debate": [[9, "research-and-ongoing-debate"]], "Acknowledgements": [[9, "acknowledgements"]]}, "indexentries": {}})
\ No newline at end of file
diff --git a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb
index d8a876a..872feda 100644
--- a/tamingllms/_build/jupyter_execute/markdown/intro.ipynb
+++ b/tamingllms/_build/jupyter_execute/markdown/intro.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "1855ac0a",
+   "id": "70924298",
    "metadata": {},
    "source": [
     "(intro)=\n",
diff --git a/tamingllms/_build/jupyter_execute/notebooks/input.ipynb b/tamingllms/_build/jupyter_execute/notebooks/input.ipynb
new file mode 100644
index 0000000..6117e9d
--- /dev/null
+++ b/tamingllms/_build/jupyter_execute/notebooks/input.ipynb
@@ -0,0 +1,2454 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(input)=\n",
+    "# Managing Input Data\n",
+    "```{epigraph}\n",
+    "One home run is much better than two doubles.\n",
+    "\n",
+    "-- Steve Jobs\n",
+    "```\n",
+    "```{contents}\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Introduction\n",
+    "\n",
+    "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n",
+    "\n",
+    "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n",
+    "\n",
+    "Motivated by these challenges, this chapter explores two key components:\n",
+    "\n",
+    "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n",
+    "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n",
+    "\n",
+    "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n",
+    "\n",
+    "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n",
+    "\n",
+    "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n",
+    "\n",
+    "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Parsing Documents\n",
+    "\n",
+    "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n",
+    "\n",
+    "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### MarkItDown\n",
+    "\n",
+    "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n",
+    "\n",
+    "Key features:\n",
+    "- Simple command-line and Python API interfaces\n",
+    "- Support for multiple file formats\n",
+    "- Optional LLM integration for enhanced image descriptions\n",
+    "- Batch processing capabilities\n",
+    "- Docker support for containerized usage\n",
+    "\n",
+    "Sample usage:\n",
+    "```python\n",
+    "from markitdown import MarkItDown\n",
+    "\n",
+    "md = MarkItDown()\n",
+    "result = md.convert(\"test.xlsx\")\n",
+    "print(result.text_content)\n",
+    "```\n",
+    "\n",
+    "### Docling\n",
+    "\n",
+    "Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n",
+    "\n",
+    "Key features:\n",
+    "- Support for multiple document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, etc.)\n",
+    "- Advanced PDF parsing including layout analysis and table extraction\n",
+    "- Unified document representation format\n",
+    "- Integration with LlamaIndex and LangChain\n",
+    "- OCR support for scanned documents\n",
+    "- Simple CLI interface\n",
+    "\n",
+    "Sample usage:\n",
+    "```python\n",
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "converter = DocumentConverter()\n",
+    "result = converter.convert(\"document.pdf\")\n",
+    "print(result.document.export_to_markdown())\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Frameworks-Based Parsing\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Structured Data Extraction"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n",
+    "\n",
+    "\n",
+    "```{figure} ../data/input/forecast.png\n",
+    "---\n",
+    "name: forecast\n",
+    "alt: Forecast\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Forecast\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "FORECAST_FILE_PATH = \"../data/input/forecast.pdf\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, we will use MarkItDown to extract the text content from the document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from markitdown import MarkItDown\n",
+    "\n",
+    "md = MarkItDown()\n",
+    "result_md = md.convert(FORECAST_FILE_PATH).text_content"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we will do the same with Docling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docling.document_converter import DocumentConverter\n",
+    "\n",
+    "converter = DocumentConverter()\n",
+    "forecast_result_docling = converter.convert(source).document.export_to_markdown()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import Levenshtein\n",
+    "def levenshtein_similarity(text1: str, text2: str) -> float:\n",
+    "    \"\"\"\n",
+    "    Calculate normalized Levenshtein distance\n",
+    "    Returns value between 0 (completely different) and 1 (identical)\n",
+    "    \"\"\"\n",
+    "    distance = Levenshtein.distance(text1, text2)\n",
+    "    max_len = max(len(text1), len(text2))\n",
+    "    return 1 - (distance / max_len)\n",
+    "\n",
+    "from difflib import SequenceMatcher\n",
+    "def simple_similarity(text1: str, text2: str) -> float:\n",
+    "    \"\"\"\n",
+    "    Calculate similarity ratio using SequenceMatcher\n",
+    "    Returns value between 0 (completely different) and 1 (identical)\n",
+    "    \"\"\"\n",
+    "    return SequenceMatcher(None, text1, text2).ratio()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.13985705461925346"
+      ]
+     },
+     "execution_count": 80,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "levenshtein_similarity(forecast_result_md, forecast_result_docling)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.17779960707269155"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "simple_similarity(forecast_result_md, forecast_result_docling)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher` respectively."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Docling's result is a quite readable markdown displaying key economic variables and their forecasts. Conversely, MarkItDown's result is a bit messy and hard to read but the information is there just not in a structured format. Does it matter? That's what we will explore next."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Docling's result**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "display(Markdown(forecast_result_docling))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "{numref}`docling` shows part of the parsed result from Docling."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```{figure} ../_static/input/docling.png\n",
+    "---\n",
+    "name: docling\n",
+    "alt: Docling's result\n",
+    "scale: 60%\n",
+    "align: center\n",
+    "---\n",
+    "Docling's parsed result\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**MarkItDown's result**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import display, Markdown\n",
+    "display(Markdown(forecast_result_md[:500]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "{numref}`markitdown` shows part of the parsed result from MarkItDown.\n",
+    "\n",
+    "```{figure} ../_static/input/markitdown.png\n",
+    "---\n",
+    "name: markitdown\n",
+    "alt: MarkItDown's parsed result\n",
+    "scale: 60%\n",
+    "align: center\n",
+    "---\n",
+    "MarkItDown's parsed result\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's focus on the economic forecasts. In particular, we are interested in extracting the CIO's 2025E forecasts.\n",
+    "\n",
+    "```{figure} ../_static/input/2025.png\n",
+    "---\n",
+    "name: forecast2025\n",
+    "alt: Forecast 2025\n",
+    "scale: 45%\n",
+    "align: center\n",
+    "---\n",
+    "Forecast 2025\n",
+    "```\n",
+    "\n",
+    "We will define a `Forecast` pydantic model to represent an economic forecast composed of a `financial_variable` and a `financial_forecast`. We will also define a `EconForecast` pydantic model to represent the list of economic forecasts we want to extract from the document.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pydantic import BaseModel\n",
+    "class Forecast(BaseModel):\n",
+    "    financial_variable: str\n",
+    "    financial_forecast: float\n",
+    "class EconForecast(BaseModel):\n",
+    "    forecasts: list[Forecast]\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```python\n",
+    "BASE_PROMPT = f\"\"\"\n",
+    "    ROLE: You are an expert at structured data extraction. \n",
+    "    TASK: Extract the following data {extract_prompt} from input DOCUMENT\n",
+    "    FORMAT: The output should be a JSON object with 'financial_variable' as key and 'financial_forecast' as value.\n",
+    "    \"\"\"\n",
+    "prompt = f\"{BASE_PROMPT} \\n\\n DOCUMENT: {doc}\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_from_doc(extract_prompt: str,  doc: str, client) -> EconForecast:\n",
+    "    \"\"\"\n",
+    "    Extract data of a financial document using an LLM model.\n",
+    "    \n",
+    "    Args:\n",
+    "        doc: The financial document text to analyze\n",
+    "        client: The LLM model to use for analysis\n",
+    "        extract_prompt: The prompt to use for extraction\n",
+    "        \n",
+    "    Returns:\n",
+    "        EconForecasts object containing sentiment analysis results\n",
+    "    \"\"\"\n",
+    "\n",
+    "    BASE_PROMPT = f\"\"\"\n",
+    "    ROLE: You are an expert at structured data extraction. \n",
+    "    TASK: Extract the following data {extract_prompt} from input DOCUMENT\n",
+    "    FORMAT: The output should be a JSON object with 'financial_variable' as key and 'financial_forecast' as value.\n",
+    "    \"\"\"\n",
+    "    prompt = f\"{BASE_PROMPT} \\n\\n DOCUMENT: {doc}\"\n",
+    "    completion = client.beta.chat.completions.parse(\n",
+    "        model=\"gpt-4o-mini\",\n",
+    "        messages=[\n",
+    "            {\n",
+    "                \"role\": \"system\",\n",
+    "                \"content\": prompt\n",
+    "            },\n",
+    "            {\"role\": \"user\", \"content\": doc}\n",
+    "        ],\n",
+    "        response_format=EconForecast\n",
+    "    )\n",
+    "    return completion.choices[0].message.parsed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dotenv import load_dotenv\n",
+    "import os\n",
+    "\n",
+    "# Load environment variables from .env file\n",
+    "load_dotenv(override=True)\n",
+    "from openai import OpenAI\n",
+    "client = OpenAI()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The user then calls the `extract_from_doc` function simply defining that \"Economic Forecasts for 2025E\" is the data they would like to extract from the document. We perform the extraction twice, once with MarkItDown and once with Docling."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extract_prompt = \"Economic Forecasts for 2025E\"\n",
+    "md_financials = extract_from_doc(extract_prompt, forecast_result_md, client)\n",
+    "docling_financials = extract_from_doc(extract_prompt, forecast_result_docling, client)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The response is an `EconForecast` object containing a list of `Forecast` objects, as defined in the pydantic model. We can then convert the response to a pandas DataFrame for easier comparison."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "EconForecast(forecasts=[Forecast(financial_variable='Real global GDP (% y/y annualized)', financial_forecast=3.2), Forecast(financial_variable='Real U.S. GDP (% q/q annualized)', financial_forecast=2.4), Forecast(financial_variable='CPI inflation (% y/y)', financial_forecast=2.5), Forecast(financial_variable='Core CPI inflation (% y/y)', financial_forecast=3.0), Forecast(financial_variable='Unemployment rate (%)', financial_forecast=4.3), Forecast(financial_variable='Fed funds rate, end period (%)', financial_forecast=3.88)])"
+      ]
+     },
+     "execution_count": 99,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "md_financials"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_md_forecasts = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in md_financials.forecasts], \n",
+    "                      columns=['Variable', 'Forecast'])\n",
+    "df_docling_forecasts = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in docling_financials.forecasts], \n",
+    "                      columns=['Variable', 'Forecast'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 97,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Variable</th>\n",
+       "      <th>Forecast</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Real global GDP (% y/y annualized)</td>\n",
+       "      <td>3.20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Real U.S. GDP (% q/q annualized)</td>\n",
+       "      <td>2.40</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>CPI inflation (% y/y)</td>\n",
+       "      <td>2.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Core CPI inflation (% y/y)</td>\n",
+       "      <td>3.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Unemployment rate (%)</td>\n",
+       "      <td>4.30</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Fed funds rate, end period (%)</td>\n",
+       "      <td>3.88</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                             Variable  Forecast\n",
+       "0  Real global GDP (% y/y annualized)      3.20\n",
+       "1    Real U.S. GDP (% q/q annualized)      2.40\n",
+       "2               CPI inflation (% y/y)      2.50\n",
+       "3          Core CPI inflation (% y/y)      3.00\n",
+       "4               Unemployment rate (%)      4.30\n",
+       "5      Fed funds rate, end period (%)      3.88"
+      ]
+     },
+     "execution_count": 97,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_md_forecasts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Variable</th>\n",
+       "      <th>Forecast</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Real global GDP (% y/y annualized)</td>\n",
+       "      <td>3.20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Real U.S. GDP (% q/q annualized)</td>\n",
+       "      <td>2.40</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>CPI inflation (% y/y)</td>\n",
+       "      <td>2.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Core CPI inflation (% y/y)</td>\n",
+       "      <td>3.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Unemployment rate (%)</td>\n",
+       "      <td>4.30</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Fed funds rate, end period (%)</td>\n",
+       "      <td>3.88</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                             Variable  Forecast\n",
+       "0  Real global GDP (% y/y annualized)      3.20\n",
+       "1    Real U.S. GDP (% q/q annualized)      2.40\n",
+       "2               CPI inflation (% y/y)      2.50\n",
+       "3          Core CPI inflation (% y/y)      3.00\n",
+       "4               Unemployment rate (%)      4.30\n",
+       "5      Fed funds rate, end period (%)      3.88"
+      ]
+     },
+     "execution_count": 98,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_docling_forecasts"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The results from MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches enabled the LLM to successfully extract the economic forecast data with equal accuracy, in this particular case."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n",
+    "```{figure} ../_static/input/asset_class.png\n",
+    "---\n",
+    "name: asset_class\n",
+    "alt: Asset Class Weightings\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Asset Class Weightings\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The user will simply define the following data to extract: \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\". In that way, we expect that \"Underweight\" will be mapped to -2, \"Neutral\" to 0 and \"Overweight\" to 2 with some values in between."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extract_prompt = \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\"\n",
+    "asset_class_docling = extract_from_doc(extract_prompt, forecast_result_docling, client)\n",
+    "asset_class_md = extract_from_doc(extract_prompt, forecast_result_md, client)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "df_md = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_md.forecasts], \n",
+    "                 columns=['Variable', 'Forecast'])\n",
+    "df_docling = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_docling.forecasts], \n",
+    "                 columns=['Variable', 'Forecast'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>variable</th>\n",
+       "      <th>markitdown</th>\n",
+       "      <th>docling</th>\n",
+       "      <th>true_value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Global Equities</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>U.S. Large Cap Growth</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>U.S. Large Cap Value</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>U.S. Small Cap Growth</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>U.S. Small Cap Value</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>International Developed</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Emerging Markets</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Global Fixed Income</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>U.S. Governments</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>U.S. Mortgages</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>U.S. Corporates</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>International Fixed Income</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>High Yield</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>U.S. Investment-grade</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>Tax Exempt U.S. High Yield Tax Exempt</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>-1.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                 variable  markitdown  docling  true_value\n",
+       "0                         Global Equities         1.0      1.0         1.0\n",
+       "1                   U.S. Large Cap Growth         1.0      1.0         0.0\n",
+       "2                    U.S. Large Cap Value         1.0      1.0         1.0\n",
+       "3                   U.S. Small Cap Growth         1.0      1.0         1.0\n",
+       "4                    U.S. Small Cap Value         1.0      1.0         1.0\n",
+       "5                 International Developed         1.0     -1.0        -1.0\n",
+       "6                        Emerging Markets         1.0      0.0         0.0\n",
+       "7                     Global Fixed Income        -1.0     -1.0        -1.0\n",
+       "8                        U.S. Governments        -1.0      1.0         1.0\n",
+       "9                          U.S. Mortgages        -1.0      1.0         1.0\n",
+       "10                        U.S. Corporates        -1.0     -1.0        -1.0\n",
+       "11             International Fixed Income        -1.0      0.0         0.0\n",
+       "12                             High Yield        -1.0     -1.0        -1.0\n",
+       "13                  U.S. Investment-grade        -1.0      0.0         0.0\n",
+       "14  Tax Exempt U.S. High Yield Tax Exempt        -1.0     -1.0        -1.0"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Create DataFrame with specified columns\n",
+    "df_comparison = pd.DataFrame({\n",
+    "    'variable': df_docling['Variable'].iloc[:-1],\n",
+    "    'markitdown': df_md['Forecast'],\n",
+    "    'docling': df_docling['Forecast'].iloc[:-1],  # Drop last row\n",
+    "    'true_value': [1.0, 0.0, 1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 0.0, -1.0]\n",
+    "})\n",
+    "\n",
+    "display(df_comparison)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Markitdown accuracy: 53.33%\n",
+      "Docling accuracy: 93.33%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Calculate accuracy for markitdown and docling\n",
+    "markitdown_accuracy = (df_comparison['markitdown'] == df_comparison['true_value']).mean()\n",
+    "docling_accuracy = (df_comparison['docling'] == df_comparison['true_value']).mean()\n",
+    "\n",
+    "print(f\"Markitdown accuracy: {markitdown_accuracy:.2%}\")\n",
+    "print(f\"Docling accuracy: {docling_accuracy:.2%}\") \n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n",
+    "\n",
+    "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n",
+    "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "from docling.document_converter import DocumentConverter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_and_export_tables(file_path: Path) -> list[pd.DataFrame]:\n",
+    "    \"\"\"\n",
+    "    Convert document and export tables to DataFrames.\n",
+    "    \n",
+    "    Args:\n",
+    "        file_path: Path to input document\n",
+    "        \n",
+    "    Returns:\n",
+    "        List of pandas DataFrames containing the tables\n",
+    "    \"\"\"\n",
+    "    doc_converter = DocumentConverter()\n",
+    "    start_time = time.time()\n",
+    "    \n",
+    "    conv_res = doc_converter.convert(file_path)\n",
+    "    \n",
+    "    tables = []\n",
+    "    # Export tables\n",
+    "    for table in conv_res.document.tables:\n",
+    "        table_df: pd.DataFrame = table.export_to_dataframe()\n",
+    "        tables.append(table_df)\n",
+    "\n",
+    "    end_time = time.time() - start_time\n",
+    "    print(f\"Document converted in {end_time:.2f} seconds.\")\n",
+    "    \n",
+    "    return tables\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert and export tables\n",
+    "tables = convert_and_export_tables(Path(FORECAST_FILE_PATH))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7"
+      ]
+     },
+     "execution_count": 100,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(tables)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>Total Return in USD (%).Current</th>\n",
+       "      <th>Total Return in USD (%).WTD</th>\n",
+       "      <th>Total Return in USD (%).MTD</th>\n",
+       "      <th>Total Return in USD (%).YTD</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>DJIA</td>\n",
+       "      <td>43,828.06</td>\n",
+       "      <td>-1.8</td>\n",
+       "      <td>-2.3</td>\n",
+       "      <td>18.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>NASDAQ</td>\n",
+       "      <td>19,926.72</td>\n",
+       "      <td>0.4</td>\n",
+       "      <td>3.7</td>\n",
+       "      <td>33.7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>S&amp;P 500</td>\n",
+       "      <td>6,051.09</td>\n",
+       "      <td>-0.6</td>\n",
+       "      <td>0.4</td>\n",
+       "      <td>28.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>S&amp;P 400 Mid Cap</td>\n",
+       "      <td>3,277.20</td>\n",
+       "      <td>-1.6</td>\n",
+       "      <td>-2.6</td>\n",
+       "      <td>19.5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Russell 2000</td>\n",
+       "      <td>2,346.90</td>\n",
+       "      <td>-2.5</td>\n",
+       "      <td>-3.5</td>\n",
+       "      <td>17.3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>MSCI World</td>\n",
+       "      <td>3,817.24</td>\n",
+       "      <td>-1.0</td>\n",
+       "      <td>0.2</td>\n",
+       "      <td>22.1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>MSCI EAFE</td>\n",
+       "      <td>2,319.05</td>\n",
+       "      <td>-1.5</td>\n",
+       "      <td>0.2</td>\n",
+       "      <td>6.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>MSCI Emerging Markets</td>\n",
+       "      <td>1,107.01</td>\n",
+       "      <td>0.3</td>\n",
+       "      <td>2.7</td>\n",
+       "      <td>10.6</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                         Total Return in USD (%).Current  \\\n",
+       "0                   DJIA                       43,828.06   \n",
+       "1                 NASDAQ                       19,926.72   \n",
+       "2                S&P 500                        6,051.09   \n",
+       "3        S&P 400 Mid Cap                        3,277.20   \n",
+       "4           Russell 2000                        2,346.90   \n",
+       "5             MSCI World                        3,817.24   \n",
+       "6              MSCI EAFE                        2,319.05   \n",
+       "7  MSCI Emerging Markets                        1,107.01   \n",
+       "\n",
+       "  Total Return in USD (%).WTD Total Return in USD (%).MTD  \\\n",
+       "0                        -1.8                        -2.3   \n",
+       "1                         0.4                         3.7   \n",
+       "2                        -0.6                         0.4   \n",
+       "3                        -1.6                        -2.6   \n",
+       "4                        -2.5                        -3.5   \n",
+       "5                        -1.0                         0.2   \n",
+       "6                        -1.5                         0.2   \n",
+       "7                         0.3                         2.7   \n",
+       "\n",
+       "  Total Return in USD (%).YTD  \n",
+       "0                        18.4  \n",
+       "1                        33.7  \n",
+       "2                        28.6  \n",
+       "3                        19.5  \n",
+       "4                        17.3  \n",
+       "5                        22.1  \n",
+       "6                         6.4  \n",
+       "7                        10.6  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(tables[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>Total Return in USD (%).Current</th>\n",
+       "      <th>Total Return in USD (%).WTD</th>\n",
+       "      <th>Total Return in USD (%).MTD</th>\n",
+       "      <th>Total Return in USD (%).YTD</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Corporate &amp; Government</td>\n",
+       "      <td>4.66</td>\n",
+       "      <td>-1.34</td>\n",
+       "      <td>-0.92</td>\n",
+       "      <td>1.94</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Agencies</td>\n",
+       "      <td>4.54</td>\n",
+       "      <td>-0.58</td>\n",
+       "      <td>-0.31</td>\n",
+       "      <td>3.35</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Municipals</td>\n",
+       "      <td>3.55</td>\n",
+       "      <td>-0.87</td>\n",
+       "      <td>-0.54</td>\n",
+       "      <td>1.99</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>U.S. Investment Grade Credit</td>\n",
+       "      <td>4.79</td>\n",
+       "      <td>-1.38</td>\n",
+       "      <td>-0.93</td>\n",
+       "      <td>1.97</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>International</td>\n",
+       "      <td>5.17</td>\n",
+       "      <td>-1.40</td>\n",
+       "      <td>-0.90</td>\n",
+       "      <td>3.20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>High Yield</td>\n",
+       "      <td>7.19</td>\n",
+       "      <td>-0.22</td>\n",
+       "      <td>0.20</td>\n",
+       "      <td>8.87</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>90 Day Yield</td>\n",
+       "      <td>4.32</td>\n",
+       "      <td>4.39</td>\n",
+       "      <td>4.49</td>\n",
+       "      <td>5.33</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>2 Year Yield</td>\n",
+       "      <td>4.24</td>\n",
+       "      <td>4.10</td>\n",
+       "      <td>4.15</td>\n",
+       "      <td>4.25</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>10 Year Yield</td>\n",
+       "      <td>4.40</td>\n",
+       "      <td>4.15</td>\n",
+       "      <td>4.17</td>\n",
+       "      <td>3.88</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>30 Year Yield</td>\n",
+       "      <td>4.60</td>\n",
+       "      <td>4.34</td>\n",
+       "      <td>4.36</td>\n",
+       "      <td>4.03</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                Total Return in USD (%).Current  \\\n",
+       "0        Corporate & Government                            4.66   \n",
+       "1                      Agencies                            4.54   \n",
+       "2                    Municipals                            3.55   \n",
+       "3  U.S. Investment Grade Credit                            4.79   \n",
+       "4                 International                            5.17   \n",
+       "5                    High Yield                            7.19   \n",
+       "6                  90 Day Yield                            4.32   \n",
+       "7                  2 Year Yield                            4.24   \n",
+       "8                 10 Year Yield                            4.40   \n",
+       "9                 30 Year Yield                            4.60   \n",
+       "\n",
+       "  Total Return in USD (%).WTD Total Return in USD (%).MTD  \\\n",
+       "0                       -1.34                       -0.92   \n",
+       "1                       -0.58                       -0.31   \n",
+       "2                       -0.87                       -0.54   \n",
+       "3                       -1.38                       -0.93   \n",
+       "4                       -1.40                       -0.90   \n",
+       "5                       -0.22                        0.20   \n",
+       "6                        4.39                        4.49   \n",
+       "7                        4.10                        4.15   \n",
+       "8                        4.15                        4.17   \n",
+       "9                        4.34                        4.36   \n",
+       "\n",
+       "  Total Return in USD (%).YTD  \n",
+       "0                        1.94  \n",
+       "1                        3.35  \n",
+       "2                        1.99  \n",
+       "3                        1.97  \n",
+       "4                        3.20  \n",
+       "5                        8.87  \n",
+       "6                        5.33  \n",
+       "7                        4.25  \n",
+       "8                        3.88  \n",
+       "9                        4.03  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(tables[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Sector</th>\n",
+       "      <th>CIO View.</th>\n",
+       "      <th>CIO View.Underweight</th>\n",
+       "      <th>CIO View.Neutral</th>\n",
+       "      <th>CIO View.</th>\n",
+       "      <th>CIO View.Overweight</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Utilities</td>\n",
+       "      <td>slight over weight green   </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Financials</td>\n",
+       "      <td>slight over weight green   </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Healthcare</td>\n",
+       "      <td>slight over weight green   </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Consumer  Discretionary</td>\n",
+       "      <td>Slight over weight green  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Information  Technology</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Communication  Services</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>Industrials</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>Real Estate</td>\n",
+       "      <td>Neutral yellow  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>Energy</td>\n",
+       "      <td>slight underweight orange  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>Materials</td>\n",
+       "      <td>slight underweight orange  </td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>Consumer  Staples</td>\n",
+       "      <td>underweight red</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                     Sector                     CIO View.  \\\n",
+       "0                 Utilities  slight over weight green      \n",
+       "1                Financials  slight over weight green      \n",
+       "2                Healthcare  slight over weight green      \n",
+       "3   Consumer  Discretionary   Slight over weight green     \n",
+       "4   Information  Technology             Neutral yellow     \n",
+       "5   Communication  Services             Neutral yellow     \n",
+       "6               Industrials             Neutral yellow     \n",
+       "7               Real Estate             Neutral yellow     \n",
+       "8                    Energy  slight underweight orange     \n",
+       "9                 Materials  slight underweight orange     \n",
+       "10        Consumer  Staples               underweight red   \n",
+       "\n",
+       "   CIO View.Underweight CIO View.Neutral CIO View. CIO View.Overweight  \n",
+       "0                                                                    \n",
+       "1                                                                    \n",
+       "2                                                                    \n",
+       "3                                                                    \n",
+       "4                                                                    \n",
+       "5                                                                    \n",
+       "6                                                                    \n",
+       "7                                                                    \n",
+       "8                                                                    \n",
+       "9                                                                    \n",
+       "10                                                                  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(tables[6])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model to its constructor."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "md_llm = MarkItDown(llm_client=client, llm_model=\"gpt-4o-mini\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = md_llm.convert(\"../data/input/forecast.png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here's the description we obtain from the image of our input document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "\n",
+       "# Description:\n",
+       "**Markets in Review: Economic Forecasts and Asset Class Weightings (as of 12/13/2024)**\n",
+       "\n",
+       "This detailed market overview presents key performance metrics and economic forecasts as of December 13, 2024.\n",
+       "\n",
+       "**Equities Overview:**\n",
+       "- **Total Returns:** Highlights returns for major indices such as the DJIA (18.4% YTD), NASDAQ (33.7% YTD), and S&P 500 (28.6% YTD), showcasing strong performance across the board.\n",
+       "- **Forecasts:** Economic indicators reveal a projected real global GDP growth of 3.1%, with inflation rates expected to stabilize around 2.2% in 2025. Unemployment rates are anticipated to remain low at 4.4%.\n",
+       "\n",
+       "**Fixed Income:**\n",
+       "- Focuses on various segments, including Corporate & Government bonds, which offer an annualized return of 4.66% and indicate shifting trends in interest rates over 2-Year (4.25%) and 10-Year (4.03%) bonds.\n",
+       "\n",
+       "**Commodities & Currencies:**\n",
+       "- Commodities such as crude oil and gold show varied performance, with oil increasing by 4.8% and gold prices sitting at $2,648.23 per ounce.\n",
+       "- Currency metrics highlight the Euro and USD trends over the past year.\n",
+       "\n",
+       "**S&P Sector Returns:**\n",
+       "- A quick reference for sector performance indicates a significant 2.5% return in Communication Services, while other sectors like Consumer Staples and Materials display minor fluctuations.\n",
+       "\n",
+       "**CIO Asset Class Weightings:**\n",
+       "- Emphasizes strategic asset allocation recommendations which are crucial for an investor's portfolio. Underweight positions in U.S. Small Cap Growth and International Developed contrast with overweight positions in certain sectors such as Utilities and Financials, signaling tactical shifts based on ongoing economic assessments.\n",
+       "\n",
+       "**Note:** This summary is sourced from BofA Global Research and aims to provide a comprehensive view of current market conditions and forecasts to assist investors in making informed decisions.\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "display(Markdown(result.text_content))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "Overall, the description is somewhat accurate but contains a few inaccuracies including:\n",
+    "\n",
+    "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n",
+    "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n",
+    "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n",
+    "\n",
+    "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Retrieval-Augmented Generation\n",
+    "\n",
+    "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n",
+    "\n",
+    "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Case Studies\n",
+    "\n",
+    "This section presents three case studies that demonstrate practical solutions to common LLM limitations:\n",
+    "\n",
+    "First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n",
+    "\n",
+    "Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.\n",
+    "\n",
+    "Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Case Study I: Content Chunking with Contextual Linking\n",
+    "\n",
+    "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n",
+    "1. The LLM's inability to process long inputs to do context-size limits\n",
+    "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n",
+    "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n",
+    "\n",
+    "Here, we exemplify this technique by following these steps:\n",
+    "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n",
+    "\n",
+    "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n",
+    "\n",
+    "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n",
+    "\n",
+    "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n",
+    "\n",
+    "Let's examine an example implementation of this technique.\n",
+    "\n",
+    "#### Generating long-form content\n",
+    "\n",
+    "- Goal: Generate a long-form report analyzing a company's financial statement.\n",
+    "- Input: A company's 10K SEC filing.\n",
+    "\n",
+    "```{figure} ../_static/structured_output/diagram1.png\n",
+    "---\n",
+    "name: content-chunking-with-contextual-linking\n",
+    "alt: Content Chunking with Contextual Linking\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Content Chunking with Contextual Linking Schematic Representation.\n",
+    "```\n",
+    "\n",
+    "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n",
+    "\n",
+    "**Step 1: Chunking the Content**\n",
+    "\n",
+    "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n",
+    "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n",
+    "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n",
+    "  - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n",
+    "  - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n",
+    "  - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n",
+    "\n",
+    "  Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n",
+    "    \"\"\"\n",
+    "    Split input text into chunks of specified size with specified overlap.\n",
+    "\n",
+    "    Args:\n",
+    "        text (str): The input text to be chunked.\n",
+    "        chunk_size (int): The maximum size of each chunk in tokens.\n",
+    "        chunk_overlap (int): The number of tokens to overlap between chunks.\n",
+    "\n",
+    "    Returns:\n",
+    "        list: A list of text chunks.\n",
+    "    \"\"\"\n",
+    "    from langchain_text_splitters import CharacterTextSplitter\n",
+    "\n",
+    "    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n",
+    "    return text_splitter.split_text(text)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Step 2: Writing the Base Prompt Template**\n",
+    "\n",
+    "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n",
+    "- `role`: Defines the role or persona the model should assume.\n",
+    "- `context`: Provides the background information or context for the task.\n",
+    "- `instruction`: Specifies the task or action the model needs to perform.\n",
+    "- `input_text`: Contains the actual text input that the model will process.\n",
+    "- `requirements`: Lists any specific requirements or constraints for the output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.prompts import PromptTemplate\n",
+    "def get_base_prompt_template() -> str:\n",
+    "    \n",
+    "    base_prompt = \"\"\"\n",
+    "    ROLE: {role}\n",
+    "    CONTEXT: {context}\n",
+    "    INSTRUCTION: {instruction}\n",
+    "    INPUT: {input}\n",
+    "    REQUIREMENTS: {requirements}\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    prompt = PromptTemplate.from_template(base_prompt)\n",
+    "    return prompt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.output_parsers import StrOutputParser\n",
+    "from langchain_community.chat_models import ChatLiteLLM\n",
+    "\n",
+    "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n",
+    "    \"\"\"\n",
+    "    Returns an LLMChain instance using langchain.\n",
+    "\n",
+    "    Args:\n",
+    "        prompt_template (str): The prompt template to use.\n",
+    "        model_name (str): The name of the model to use.\n",
+    "        temperature (float): The temperature setting for the model.\n",
+    "\n",
+    "    Returns:\n",
+    "        llm_chain: An instance of the LLMChain.\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    from dotenv import load_dotenv\n",
+    "    import os\n",
+    "\n",
+    "    # Load environment variables from .env file\n",
+    "    load_dotenv()\n",
+    "    \n",
+    "    api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n",
+    "    llm = ChatLiteLLM(\n",
+    "        model=model_name,\n",
+    "        temperature=temperature,\n",
+    "        api_key=os.environ[api_key_label],\n",
+    "    )\n",
+    "    llm_chain = prompt_template | llm | StrOutputParser()\n",
+    "    return llm_chain"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Step 3: Constructing Dynamic Prompt Parameters**\n",
+    "\n",
+    "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict\n",
+    "def get_dynamic_prompt_params(prompt_params: Dict, \n",
+    "                            part_idx: int, \n",
+    "                            total_parts: int,\n",
+    "                            chat_context: str,\n",
+    "                            chunk: str) -> str:\n",
+    "    \"\"\"\n",
+    "    Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n",
+    "    \n",
+    "    Args:\n",
+    "        prompt_params (Dict): Original prompt parameters\n",
+    "        part_idx (int): Index of current conversation part\n",
+    "        total_parts (int): Total number of conversation parts\n",
+    "        chat_context (str): Chat context from previous parts\n",
+    "        chunk (str): Current chunk of text to be processed\n",
+    "    Returns:\n",
+    "        str: Dynamically constructed prompt template with part-specific params\n",
+    "    \"\"\"\n",
+    "    dynamic_prompt_params = prompt_params.copy()\n",
+    "    # saves the chat context from previous parts\n",
+    "    dynamic_prompt_params[\"context\"] = chat_context\n",
+    "    # saves the current chunk of text to be processed as input\n",
+    "    dynamic_prompt_params[\"input\"] = chunk\n",
+    "    \n",
+    "    # Add part-specific instructions\n",
+    "    if part_idx == 0: # Introduction part\n",
+    "        dynamic_prompt_params[\"instruction\"] = f\"\"\"\n",
+    "        You are generating the Introduction part of a long report.\n",
+    "        Don't cover any topics yet, just define the scope of the report.\n",
+    "        \"\"\"\n",
+    "    elif part_idx == total_parts - 1: # Conclusion part\n",
+    "        dynamic_prompt_params[\"instruction\"] = f\"\"\"\n",
+    "        You are generating the last part of a long report. \n",
+    "        For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n",
+    "        \"\"\"\n",
+    "    else: # Main analysis part\n",
+    "        dynamic_prompt_params[\"instruction\"] = f\"\"\"\n",
+    "        You are generating part {part_idx+1} of {total_parts} parts of a long report.\n",
+    "        For this part, analyze the below INPUT.\n",
+    "        Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n",
+    "        \"\"\"\n",
+    "    \n",
+    "    return dynamic_prompt_params"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "**Step 4: Generating the Report**\n",
+    "\n",
+    "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_report(input_content: str, llm_model_name: str, \n",
+    "                    role: str, requirements: str,\n",
+    "                    chunk_size: int, chunk_overlap: int) -> str:\n",
+    "    # stores the parts of the report, each generated by an individual LLM call\n",
+    "    report_parts = [] \n",
+    "    # split the input content into chunks\n",
+    "    chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n",
+    "    # initialize the chat context with the input content\n",
+    "    chat_context = input_content\n",
+    "    # number of parts to be generated\n",
+    "    num_parts = len(chunks)\n",
+    "\n",
+    "    prompt_params = {\n",
+    "        \"role\": role, # user-provided\n",
+    "        \"context\": \"\", # dinamically updated per part\n",
+    "        \"instruction\": \"\", # dynamically updated per part\n",
+    "        \"input\": \"\", # dynamically updated per part\n",
+    "        \"requirements\": requirements #user-priovided\n",
+    "    }\n",
+    "\n",
+    "    # get the LLMChain with the base prompt template\n",
+    "    llm_chain = get_llm_chain(get_base_prompt_template(), \n",
+    "                                 llm_model_name)\n",
+    "\n",
+    "    # dynamically update prompt_params per part\n",
+    "    print(f\"Generating {num_parts} report parts\")\n",
+    "    for i, chunk in enumerate(chunks):\n",
+    "        dynamic_prompt_params = get_dynamic_prompt_params(\n",
+    "            prompt_params,\n",
+    "            part_idx=i,\n",
+    "            total_parts=num_parts,\n",
+    "            chat_context=chat_context,\n",
+    "            chunk=chunk\n",
+    "        )\n",
+    "        \n",
+    "        # invoke the LLMChain with the dynamically updated prompt parameters\n",
+    "        response = llm_chain.invoke(dynamic_prompt_params)\n",
+    "\n",
+    "        # update the chat context with the cummulative response\n",
+    "        if i == 0:\n",
+    "            chat_context = response\n",
+    "        else:\n",
+    "            chat_context = chat_context + response\n",
+    "            \n",
+    "        print(f\"Generated part {i+1}/{num_parts}.\")\n",
+    "        report_parts.append(response)\n",
+    "\n",
+    "    report = \"\\n\".join(report_parts)\n",
+    "    return report"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Example Usage**\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the text from sample 10K SEC filing\n",
+    "with open('../data/apple.txt', 'r') as file:\n",
+    "    text = file.read()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the chunk and chunk overlap size\n",
+    "MAX_CHUNK_SIZE = 10000\n",
+    "MAX_CHUNK_OVERLAP = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "report = generate_report(text, llm_model_name=\"gemini/gemini-1.5-flash-latest\", \n",
+    "                           role=\"Financial Analyst\", \n",
+    "                           requirements=\"The report should be in a readable, structured format, easy to understand and follow. Focus on finding risk factors and market moving insights.\",\n",
+    "                           chunk_size=MAX_CHUNK_SIZE, \n",
+    "                           chunk_overlap=MAX_CHUNK_OVERLAP)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the generated report to a local file\n",
+    "with open('data/apple_report.txt', 'w') as file:\n",
+    "    file.write(report)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "**Introduction**\n",
+       "\n",
+       "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for the fiscal year ended September 28, 2024, as disclosed in its Form 10-K filing with the United States Securities and Exchange Commission.  The analysis will focus on identifying key risk factors impacting Apple's business, evaluating its financial health, and uncovering market-moving insights derived from the provided data.  The report will delve into Apple's various segments, product lines, and services, examining their performance and contributions to overall financial results.  Specific attention will be paid to identifying trends, potential challenges, and opportunities for future growth.  The analysis will also consider the broader macroeconomic environment and its influence on Apple's operations and financial outlook.  Finally, the report will incorporate relevant information from Apple's definitive proxy statement for its 2025 annual meeting of shareholders, as incorporated by reference in the Form 10-K.\n",
+       "\n",
+       "**PART 2: Key Risk Factors and Market-Moving Insights**\n",
+       "\n",
+       "This section analyzes key risk factors disclosed in Apple Inc.'s 2024 Form 10-K, focusing on their potential impact on financial performance and identifying potential market-moving insights.  The analysis is structured around the major risk categories identified in the filing.\n",
+       "\n",
+       "**2.1 Dependence on Third-Party Developers:**\n",
+       "\n",
+       "Apple's success is heavily reliant on the continued support and innovation of third-party software developers.  The Form 10-K highlights several critical aspects of this dependence:\n",
+       "\n",
+       "* **Market Share Vulnerability:** Apple's relatively smaller market share in smartphones, personal computers, and tablets compared to competitors (Android, Windows, gaming consoles) could discourage developers from prioritizing Apple's platform, leading to fewer high-quality apps and potentially impacting customer purchasing decisions.  This is a significant risk, especially given the rapid pace of technological change.  A decline in app availability or quality could negatively impact sales and market share.  **Market-moving insight:**  Monitoring developer activity and app quality across competing platforms is crucial for assessing this risk.  Any significant shift in developer focus away from iOS could be a negative market signal.\n",
+       "\n",
+       "* **App Store Dynamics:** While Apple allows developers to retain most App Store revenue, its commission structure and recent changes (e.g., complying with the Digital Markets Act (DMA) in the EU) introduce uncertainty.  Changes to the App Store's policies or fee structures could materially affect Apple's revenue and profitability.  **Market-moving insight:**  Closely monitoring regulatory developments (especially concerning the DMA) and their impact on App Store revenue is essential.  Any significant changes to Apple's App Store policies or revenue streams could trigger market reactions.\n",
+       "\n",
+       "* **Content Acquisition and Creation:** Apple's reliance on third-party digital content providers for its services introduces risks related to licensing agreements, competition, and pricing.  The cost of producing its own digital content is also increasing due to competition for talent and subscribers.  Failure to secure or create appealing content could negatively impact user engagement and revenue.  **Market-moving insight:**  Analyzing the success of Apple's original content initiatives and the renewal rates of third-party content agreements will provide insights into this risk.\n",
+       "\n",
+       "**2.2 Operational Risks:**\n",
+       "\n",
+       "\n",
+       " (...) \n",
+       "\n",
+       " The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability.  While increased R&D is generally positive, it reduces short-term profits.  The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China.  **Market-moving insight:**  Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n",
+       "\n",
+       "\n",
+       "**5.4 Auditor's Report and Internal Controls:**\n",
+       "\n",
+       "The auditor's report expresses an unqualified opinion on Apple's financial statements and internal control over financial reporting.  However, it identifies uncertain tax positions as a critical audit matter.  The significant amount of unrecognized tax benefits ($22.0 billion) and the complexity involved in evaluating these positions highlight a substantial risk.  Management's assessment of these positions involves significant judgment and relies on interpretations of complex tax laws.  Apple's management also asserts that its disclosure controls and procedures are effective.  **Market-moving insight:**  Any changes in tax laws, unfavorable rulings on uncertain tax positions, or weaknesses in internal controls could materially affect Apple's financial results and investor confidence.\n",
+       "\n",
+       "\n",
+       "**Conclusion**\n",
+       "\n",
+       "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for fiscal year 2024.  While Apple maintains a strong financial position with substantial cash reserves and a robust capital return program, several key risk factors could significantly impact its future performance.  These risks include:\n",
+       "\n",
+       "* **Dependence on third-party developers:**  A shift in developer focus away from iOS or changes to the App Store's policies could negatively impact Apple's revenue and profitability.\n",
+       "* **Operational risks:**  Employee retention challenges, reseller dependence, and cybersecurity threats pose significant operational risks.\n",
+       "* **Legal and regulatory risks:**  Ongoing antitrust litigation, the Digital Markets Act (DMA) compliance, and data privacy regulations introduce substantial legal and regulatory uncertainties.\n",
+       "* **Financial risks:**  Volatility in sales and profit margins, foreign exchange rate fluctuations, credit risk, and tax risks could impact Apple's financial performance.\n",
+       "* **Supply chain concentration:**  Apple's reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, and dependence on single or limited sources for certain custom components, exposes the company to significant supply chain risks.\n",
+       "* **Uncertain tax positions:**  The significant amount of unrecognized tax benefits represents a substantial uncertainty that could materially affect Apple's financial results.\n",
+       "\n",
+       "Despite these risks, Apple's strong liquidity position, continued growth in its Services segment, and robust capital return program provide a degree of resilience.  However, investors and analysts should closely monitor the market-moving insights identified throughout this report, including developer activity, regulatory developments, regional economic conditions, supply chain stability, and the resolution of uncertain tax positions, to assess their potential impact on Apple's future performance and valuation.  The significant short-term obligations, while manageable given Apple's cash position, highlight the need for continued financial discipline and effective risk management.  A deeper, more granular analysis of the financial statements and notes is recommended for a more complete assessment."
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Read and display the generated report\n",
+    "with open('../data/apple_report.txt', 'r') as file:\n",
+    "    report_content = file.read()\n",
+    "    \n",
+    "from IPython.display import Markdown\n",
+    "\n",
+    "# Display first and last 10% of the report content\n",
+    "report_lines = report_content.splitlines()\n",
+    "total_lines = len(report_lines)\n",
+    "quarter_lines = total_lines // 10\n",
+    "\n",
+    "top_portion = '\\n'.join(report_lines[:quarter_lines])\n",
+    "bottom_portion = '\\n'.join(report_lines[-quarter_lines:])\n",
+    "\n",
+    "display(Markdown(f\"{top_portion}\\n\\n (...) \\n\\n {bottom_portion}\"))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Discussion\n",
+    "\n",
+    "Results from the generated report present a few interesting aspects:\n",
+    "\n",
+    "- **Coherence**: The generated report demonstrates an apparent level of coherence. The sections are logically structured, and the flow of information is smooth. Each part of the report builds upon the previous sections, providing a comprehensive analysis of Apple Inc.'s financial performance and key risk factors. The use of headings and subheadings helps in maintaining clarity and organization throughout the document.\n",
+    "\n",
+    "- **Adherence to Instructions**: The LLM followed the provided instructions effectively. The report is in a readable, structured format, and it focuses on identifying risk factors and market-moving insights as requested. The analysis is detailed and covers various aspects of Apple's financial performance, including revenue segmentation, profitability, liquidity, and capital resources. The inclusion of market-moving insights adds value to the report, aligning with the specified requirements.\n",
+    "\n",
+    "Despite the seemingly good quality of the results, there are some limitations to consider:\n",
+    "\n",
+    "- **Depth of Analysis**: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert's evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.\n",
+    "\n",
+    "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n",
+    "\n",
+    "Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic's Contextual Retrieval {cite}`anthropic2024contextualretrieval`. The approach, as shown in {numref}`anth_contextual`, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.\n",
+    "```{figure} ../_static/input/anth_contextual.png\n",
+    "---\n",
+    "name: anth_contextual\n",
+    "alt: Anthropic Contextual Linking\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Anthropic Contextual Linking {cite}`anthropic2024contextualretrieval`.\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Case Study II: Github RAG\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Case Study III: Quiz Generation with Citations\n",
+    "\n",
+    "In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.\n",
+    "\n",
+    "#### Use Case\n",
+    "\n",
+    "Let's assume you are a Harvard student enrolled in GOV 1039 \"The Birth of Modern Democracy\" (see {numref}`harvard-class`), you face a daunting reading list for next Tuesday's class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.\n",
+    "\n",
+    "```{figure} ../_static/input/harvard.png\n",
+    "---\n",
+    "name: harvard-class\n",
+    "alt: Harvard Class\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Harvard's Democratic Theory Class\n",
+    "```\n",
+    "\n",
+    "Instead of trudging through these dense historical texts sequentially, we would like to:\n",
+    "- Extract key insights and connections between these documents, conversationally.\n",
+    "- Engage with the material through a quiz format.\n",
+    "- Add citations to help with verifying answers.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Implementation\n",
+    "\n",
+    "The full implementation is available at Book's [Github repository](https://github.com/souzatharsis/tamingLLMs/tamingllms/notebooks/src/gemini_duo.py). Here, we will cover the most relevant parts of the implementation.\n",
+    "\n",
+    "**Client Class**\n",
+    "\n",
+    "First, we will define the `Client` class which will provide the key interface users will interact with. It has the following summarized interface:\n",
+    "\n",
+    "- Initialization:\n",
+    "    - `__init__(knowledge_base: List[str] = [])`: Initialize with optional list of URLs as knowledge base\n",
+    "\n",
+    "- Core Methods:\n",
+    "    - `add_knowledge_base(urls: List[str]) -> None`: Add URLs to the knowledge base\n",
+    "    - `add(urls: List[str]) -> None`: Extract content from URLs and add to conversation input\n",
+    "    - `msg(msg: str = \"\", add_citations: bool = False) -> str`: Enables users to send messages to the client\n",
+    "    - `quiz(add_citations: bool = True, num_questions: int = 10) -> str`: Generate a quiz based on full input memory\n",
+    "\n",
+    "- Key Attributes:\n",
+    "    - `knowledge_base`: List of URLs providing foundation knowledge\n",
+    "    - `input`: Current input being studied (short-term memory)\n",
+    "    - `input_memory`: Cumulative input + knowledge base (long-term memory) \n",
+    "    - `response`: Latest response from LLM\n",
+    "    - `response_memory`: Cumulative responses (long-term memory)\n",
+    "    - `urls_memory`: Cumulative list of processed URLs\n",
+    "\n",
+    "\n",
+    "**Corpus-in-Context Prompting**\n",
+    "\n",
+    "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n",
+    "\n",
+    "{numref}`cic` shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.\n",
+    "\n",
+    "```{figure} ../_static/input/cic.png\n",
+    "---\n",
+    "name: cic\n",
+    "alt: CIC Format\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Example of Corpus-in-Context Prompting for retrieval. \n",
+    "```\n",
+    "\n",
+    "CiC prompting leverages LLM's capacity to follow instructions by carefully annotating the corpus with document IDs. It benefits from a strong, capable models to retrieve over large corpora provided in context. \n",
+    "\n",
+    "```python\n",
+    "    def add(self, urls: List[str]) -> None:\n",
+    "        self.urls = urls\n",
+    "\n",
+    "        # Add new content to input following CIC format to enable citations\n",
+    "        for url in urls:\n",
+    "            self.urls_memory.append(url)\n",
+    "            content = self.extractor.convert(url).text_content\n",
+    "            formatted_content = f\"ID: {self.reference_id} | {content} | END ID: {self.reference_id}\"\n",
+    "            self.input += formatted_content + \"\\n\" \n",
+    "            self.reference_id += 1\n",
+    "        \n",
+    "        # Update memory\n",
+    "        self.input_memory = self.input_memory + self.input\n",
+    "```\n",
+    "\n",
+    "The method `add_knowledge_base()` is a simple wrapper around the `add()` method. It is used to add URLs to the knowledge base, which are later cached by the LLM model as we will see later.\n",
+    "\n",
+    "```python\n",
+    "    def add_knowledge_base(self, urls: List[str]) -> None:\n",
+    "        self.add(urls)\n",
+    "```\n",
+    "\n",
+    "\n",
+    "Later, when the user sends a message to the client, the `msg()` method is used to generate a response  while enabling citations. `self.content_generator` is an instance of our LLM model, which we will go through next.\n",
+    "\n",
+    "```python\n",
+    "    def msg(self, msg: str = \"\", add_citations: bool = False) -> str:\n",
+    "        if add_citations:\n",
+    "            msg = msg + \"\\n\\n For key statements, add Input ID to the response.\"\n",
+    "\n",
+    "        self.response = self.content_generator.generate(\n",
+    "            input_content=self.input,\n",
+    "            user_instructions=msg\n",
+    "        )\n",
+    "\n",
+    "        self.response_memory = self.response_memory + self.response.text\n",
+    "\n",
+    "        return self.response.text\n",
+    "```\n",
+    "\n",
+    "**Prompt Caching**\n",
+    "\n",
+    "LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.\n",
+    "\n",
+    "In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` containing the entire knowledge base and any additional user added content.\n",
+    "```python\n",
+    "self.llm = LLMBackend(input=self.input_memory)\n",
+    "```\n",
+    "\n",
+    "In our `LLMBackend` Class, we leverage prompt caching on input tokens and uses them for subsequent requests.\n",
+    "\n",
+    "```python\n",
+    "class LLMBackend:\n",
+    "    def __init__(self, model_name: str, input: str, cache_ttl: int = 60):\n",
+    "        self.cache = caching.CachedContent.create(\n",
+    "            model=model_name,\n",
+    "            display_name='due_knowledge_base', # used to identify the cache\n",
+    "            system_instruction=(\n",
+    "            self.compose_prompt(input, conversation_config)\n",
+    "        ),\n",
+    "        ttl=datetime.timedelta(minutes=cache_ttl),\n",
+    "    )\n",
+    "\n",
+    "    self.model = genai.GenerativeModel.from_cached_content(cached_content=self.cache)\n",
+    "```\n",
+    "\n",
+    "**Quiz Generation**\n",
+    "\n",
+    "Coming back to our `Client` class, we implement the `quiz()` method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.\n",
+    "\n",
+    "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke its `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n",
+    "\n",
+    "```python\n",
+    "    def quiz(self, add_citations: bool = True, num_questions: int = 10) -> str:\n",
+    "        \"\"\"\n",
+    "        Returns a quiz instance based on full input memory.\n",
+    "        \"\"\"\n",
+    "        self.quiz_instance = Quiz(\n",
+    "                         input=self.input_memory,\n",
+    "                         add_citations=add_citations,\n",
+    "                         num_questions=num_questions)\n",
+    "        return self.quiz_instance\n",
+    "```\n",
+    "\n",
+    "We write a simple prompt template for quiz generation:\n",
+    "\n",
+    "> ROLE:\n",
+    "> - You are a Harvard Professor providing a quiz.\n",
+    "> INSTRUCTIONS:\n",
+    "> - Generate a quiz with {num_questions} questions based on the input.\n",
+    "> - The quiz should be multi-choice.\n",
+    "> - Answers should be provided at the end of the quiz.\n",
+    "> - Questions should have broad coverage of the input including multiple Input IDs.\n",
+    "> - Level of difficulty is advanced/hard.\n",
+    "> - {{citations}}\n",
+    ">\n",
+    "> STRUCTURE:\n",
+    "> - Sequence of questions and alternatives.\n",
+    "> - At the end provide the correct answers.\n",
+    "\n",
+    "where, `{citations}` instructs the model to add CiC citations to the response if user requests it."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Example Usage\n",
+    "\n",
+    "\n",
+    "**Dataset**\n",
+    "\n",
+    "First, we will define our knowledge base. \n",
+    "\n",
+    "- Harvard Class: [GOV 1039 Syllabus](https://scholar.harvard.edu/files/dlcammack/files/gov_1039_syllabus.pdf)\n",
+    "- Class / Topic: \"Rights\"\n",
+    "- Reading List:\n",
+    "    - ID 1. The Declaration of Independence of the United States of America\n",
+    "    - ID 2. The United States Bill of Rights\n",
+    "    - ID 3. John F. Kennedy's Inaugural Address\n",
+    "    - ID 4. Lincoln's Gettysburg Address\n",
+    "    - ID 5. The United States Constitution\n",
+    "    - ID 6. Give Me Liberty or Give Me Death\n",
+    "    - ID 7. The Mayflower Compact\n",
+    "    - ID 8. Abraham Lincoln's Second Inaugural Address\n",
+    "    - ID 9. Abraham Lincoln's First Inaugural Address\n",
+    "\n",
+    "We will take advantage of Project Gutenberg's to create our knowledge base."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kb = [f\"https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt\" for i in range(1,9)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will import our module `gemini_duo` as `genai_duo` and initialize the `Client` class with our knowledge base."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gemini_duo as genai_duo\n",
+    "from IPython.display import Markdown, display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "duo = genai_duo.Client(knowledge_base=kb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "At this point, we converted each book into markdown using MarkitDown and cached the content in our LLM model. We can access how many tokens we have cached in our LLM model by looking at the `usage_metadata` attribute of the Gemini's model response. At this point, we have cached at total of 38470 tokens.\n",
+    "\n",
+    "Now, we can add references to our knowledge base at anytime by calling the `add()` method. We add the following references:\n",
+    "1. The Magna Carta\n",
+    "2. William Shap McKechnie on Magna Carta book"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "study_references = [\"https://www.gutenberg.org/cache/epub/10000/pg10000.txt\", \"https://www.gutenberg.org/cache/epub/65363/pg65363.txt\"]\n",
+    "\n",
+    "duo.add(study_references)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can instantiate a `Quiz` object and generate a quiz based on the full input memory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "quiz = duo.quiz(add_citations=True)\n",
+    "display(Markdown(quiz.generate()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "{numref}`quiz` shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n",
+    "\n",
+    "```{figure} ../_static/input/quiz.png\n",
+    "---\n",
+    "name: quiz\n",
+    "alt: Quiz with Citations\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Sample Quiz with Citations.\n",
+    "```\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Discussion\n",
+    "\n",
+    "The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n",
+    "\n",
+    "However, several limitations emerged during this process:\n",
+    "\n",
+    "1. Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.\n",
+    "\n",
+    "2. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n",
+    "\n",
+    "3. Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.\n",
+    "\n",
+    "While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Conclusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n",
+    "\n",
+    "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n",
+    "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n",
+    "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n",
+    "\n",
+    "```\n",
+    "@misc{tharsistpsouza2024tamingllms,\n",
+    "  author = {Tharsis T. P. Souza},\n",
+    "  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n",
+    "  year = {2024},\n",
+    "  chapter = {Managing Input Data},\n",
+    "  journal = {GitHub repository},\n",
+    "  url = {https://github.com/souzatharsis/tamingLLMs)\n",
+    "}\n",
+    "```\n",
+    "## References\n",
+    "```{bibliography}\n",
+    ":filter: docname in docnames\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
\ No newline at end of file
diff --git a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb
index 1c42990..a95dcb4 100644
--- a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb
+++ b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb
@@ -181,11 +181,11 @@
     "Performance Comparison including proprietary models.\n",
     "```\n",
     "\n",
-    "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n",
+    "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive cost efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n",
     "\n",
-    "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.\n",
+    "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.\n",
     "\n",
-    "```{figure} ../_static/local/deep.png\n",
+    "```{figure} ../_static/local/deep.jpeg\n",
     "---\n",
     "name: deep\n",
     "alt: DeepSeek-V3\n",
@@ -195,7 +195,7 @@
     "DeepSeek-V3 Performance Comparison\n",
     "```\n",
     "\n",
-    "```{figure} ../_static/local/deep2.png\n",
+    "```{figure} ../_static/local/deep2.jpeg\n",
     "---\n",
     "name: deep2\n",
     "alt: DeepSeek-V3 Cost Benefit Analysis\n",
diff --git a/tamingllms/_static/input/anth_contextual.png b/tamingllms/_static/input/anth_contextual.png
new file mode 100644
index 0000000..c8401c0
Binary files /dev/null and b/tamingllms/_static/input/anth_contextual.png differ
diff --git a/tamingllms/_static/input/asset_class.png b/tamingllms/_static/input/asset_class.png
new file mode 100644
index 0000000..237d081
Binary files /dev/null and b/tamingllms/_static/input/asset_class.png differ
diff --git a/tamingllms/_static/input/docling.png b/tamingllms/_static/input/docling.png
new file mode 100644
index 0000000..143ded9
Binary files /dev/null and b/tamingllms/_static/input/docling.png differ
diff --git a/tamingllms/_static/input/markitdown.png b/tamingllms/_static/input/markitdown.png
new file mode 100644
index 0000000..282503c
Binary files /dev/null and b/tamingllms/_static/input/markitdown.png differ
diff --git a/tamingllms/_toc.yml b/tamingllms/_toc.yml
index fa8fb2f..74069fe 100644
--- a/tamingllms/_toc.yml
+++ b/tamingllms/_toc.yml
@@ -10,7 +10,7 @@ chapters:
 - file: markdown/intro.md
 - file: notebooks/evals.ipynb
 - file: notebooks/structured_output.ipynb
-#- file: notebooks/input.ipynb
+- file: notebooks/input.ipynb
 - file: notebooks/safety.ipynb
 - file: notebooks/alignment.ipynb
 - file: notebooks/local.ipynb
diff --git a/tamingllms/notebooks/input.ipynb b/tamingllms/notebooks/input.ipynb
index c323ec5..d05669d 100644
--- a/tamingllms/notebooks/input.ipynb
+++ b/tamingllms/notebooks/input.ipynb
@@ -21,10 +21,22 @@
    "source": [
     "## Introduction\n",
     "\n",
+    "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n",
     "\n",
+    "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n",
     "\n",
+    "Motivated by these challenges, this chapter explores two key components:\n",
     "\n",
-    "When building applications with language models, developers often default to complex architectures involving retrieval systems, chunking strategies, and sophisticated pipelines. However, these approaches add unnecessary complexity when simpler solutions exist. This is where long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` come in. LCLMs are a new class of models that can process massive amounts of text - up to millions of tokens - in a single forward pass. This capability means they can directly ingest and reason about entire documents or datasets without requiring external tools or complex preprocessing steps. The implications are significant: developers can build more maintainable systems by simply feeding raw text to the model rather than orchestrating complicated retrieval and chunking pipelines. Recent benchmarks have shown that this straightforward approach can match or exceed the performance of more complex systems like RAG, despite never being explicitly trained for such tasks. Before implementing sophisticated architectures, developers should first evaluate whether an LCLM's native capabilities might offer a simpler path to their goals."
+    "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n",
+    "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n",
+    "\n",
+    "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n",
+    "\n",
+    "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n",
+    "\n",
+    "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n",
+    "\n",
+    "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases."
    ]
   },
   {
@@ -33,9 +45,9 @@
    "source": [
     "## Parsing Documents\n",
     "\n",
-    "When discussing document processing with LLMs, there's often a focus on sophisticated algorithms from chunking to contextual inferencing to RAGs. However, this misses the core challenge in production systems, which is 80% about cleaning and normalizing the input, and 20% about actually algorithmic inferencing.\n",
+    "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n",
     "\n",
-    "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores powerful tools and frameworks like MarkItDown, Docling, and LangChain that streamline document processing. These tools provide unified interfaces for converting diverse document formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details.\n"
+    "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance."
    ]
   },
   {
@@ -44,7 +56,7 @@
    "source": [
     "### MarkItDown\n",
     "\n",
-    "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats. The tool is particularly useful for document indexing and text analysis tasks.\n",
+    "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n",
     "\n",
     "Key features:\n",
     "- Simple command-line and Python API interfaces\n",
@@ -95,14 +107,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Case Study: Structured Data Extraction"
+    "### Structured Data Extraction"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "A common use case where document parsing matters is to extract structured data from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`.  {numref}`forecast` shows page 7 of the mentioned document, which contains several economic variables. \n",
+    "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n",
     "\n",
     "\n",
     "```{figure} ../data/input/forecast.png\n",
@@ -116,13 +128,6 @@
     "```"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We will focus on the page containing the economic forecasts."
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 76,
@@ -265,141 +270,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/markdown": [
-       "## MARKETS IN REVIEW\n",
-       "\n",
-       "## Equities\n",
-       "\n",
-       "|                       | Total Return in USD (%)   | Total Return in USD (%)   | Total Return in USD (%)   | Total Return in USD (%)   |\n",
-       "|-----------------------|---------------------------|---------------------------|---------------------------|---------------------------|\n",
-       "|                       | Current                   | WTD                       | MTD                       | YTD                       |\n",
-       "| DJIA                  | 43,828.06                 | -1.8                      | -2.3                      | 18.4                      |\n",
-       "| NASDAQ                | 19,926.72                 | 0.4                       | 3.7                       | 33.7                      |\n",
-       "| S&P 500               | 6,051.09                  | -0.6                      | 0.4                       | 28.6                      |\n",
-       "| S&P 400 Mid Cap       | 3,277.20                  | -1.6                      | -2.6                      | 19.5                      |\n",
-       "| Russell 2000          | 2,346.90                  | -2.5                      | -3.5                      | 17.3                      |\n",
-       "| MSCI World            | 3,817.24                  | -1.0                      | 0.2                       | 22.1                      |\n",
-       "| MSCI EAFE             | 2,319.05                  | -1.5                      | 0.2                       | 6.4                       |\n",
-       "| MSCI Emerging Markets | 1,107.01                  | 0.3                       | 2.7                       | 10.6                      |\n",
-       "\n",
-       "## Fixed Income †\n",
-       "\n",
-       "|                              | Total Return in USD (%)   | Total Return in USD (%)   | Total Return in USD (%)   | Total Return in USD (%)   |\n",
-       "|------------------------------|---------------------------|---------------------------|---------------------------|---------------------------|\n",
-       "|                              | Current                   | WTD                       | MTD                       | YTD                       |\n",
-       "| Corporate & Government       | 4.66                      | -1.34                     | -0.92                     | 1.94                      |\n",
-       "| Agencies                     | 4.54                      | -0.58                     | -0.31                     | 3.35                      |\n",
-       "| Municipals                   | 3.55                      | -0.87                     | -0.54                     | 1.99                      |\n",
-       "| U.S. Investment Grade Credit | 4.79                      | -1.38                     | -0.93                     | 1.97                      |\n",
-       "| International                | 5.17                      | -1.40                     | -0.90                     | 3.20                      |\n",
-       "| High Yield                   | 7.19                      | -0.22                     | 0.20                      | 8.87                      |\n",
-       "| 90 Day Yield                 | 4.32                      | 4.39                      | 4.49                      | 5.33                      |\n",
-       "| 2 Year Yield                 | 4.24                      | 4.10                      | 4.15                      | 4.25                      |\n",
-       "| 10 Year Yield                | 4.40                      | 4.15                      | 4.17                      | 3.88                      |\n",
-       "| 30 Year Yield                | 4.60                      | 4.34                      | 4.36                      | 4.03                      |\n",
-       "\n",
-       "## Commodities & Currencies\n",
-       "\n",
-       "|                       | Total Return in USD (%)   | Total Return in USD (%)   | Total Return in USD (%)   | Total Return in USD (%)   |\n",
-       "|-----------------------|---------------------------|---------------------------|---------------------------|---------------------------|\n",
-       "| Commodities           | Current                   | WTD                       | MTD                       | YTD                       |\n",
-       "| Bloomberg Commodity   | 237.90                    | 1.3                       | 0.7                       | 5.1                       |\n",
-       "| WTI Crude $/Barrel †† | 71.29                     | 6.1                       | 4.8                       | -0.5                      |\n",
-       "| Gold Spot $/Ounce ††  | 2648.23                   | 0.6                       | 0.2                       | 28.4                      |\n",
-       "\n",
-       "## Total Return in USD (%)\n",
-       "\n",
-       "| Currencies   |   Current |   Prior   Week End |   Prior   Month End |   2022   Year End |\n",
-       "|--------------|-----------|--------------------|---------------------|-------------------|\n",
-       "| EUR/USD      |      1.05 |               1.06 |                1.06 |              1.1  |\n",
-       "| USD/JPY      |    153.65 |             150    |              149.77 |            141.04 |\n",
-       "| USD/CNH      |      7.28 |               7.28 |                7.25 |              7.13 |\n",
-       "\n",
-       "## S&P Sector Returns\n",
-       "\n",
-       "<!-- image -->\n",
-       "\n",
-       "Sources: Bloomberg, Factset. Total Returns from the period of 12/9/2024 to 12/13/2024. †Bloomberg Barclays Indices. ††Spot price returns. All data as of the 12/13/2024 close. Data would differ if a different time period was displayed. Short-term performance shown to illustrate more recent trend. Past performance is no guarantee\n",
-       "\n",
-       "of future results.\n",
-       "\n",
-       "## Economic Forecasts (as of 12/13/2024)\n",
-       "\n",
-       "|                                    | Q4 2024E   |   2024E | Q1 2025E   | Q2 2025E   | Q3 2025E   | Q4 2025E   |   2025E |\n",
-       "|------------------------------------|------------|---------|------------|------------|------------|------------|---------|\n",
-       "| Real global GDP (% y/y annualized) | -          |    3.1  | -          | -          | -          | -          |    3.2  |\n",
-       "| Real U.S. GDP (% q/q annualized)   | 2.0        |    2.7  | 2.5        | 2.3        | 2.2        | 2.2        |    2.4  |\n",
-       "| CPI inflation (% y/y)              | 2.7        |    2.9  | 2.3        | 2.3        | 2.7        | 2.5        |    2.5  |\n",
-       "| Core CPI inflation (% y/y)         | 3.3        |    3.4  | 3.0        | 2.9        | 3.2        | 3.1        |    3    |\n",
-       "| Unemployment rate (%)              | 4.2        |    4    | 4.3        | 4.3        | 4.4        | 4.4        |    4.3  |\n",
-       "| Fed funds rate, end period (%)     | 4.38       |    4.38 | 4.13       | 3.88       | 3.88       | 3.88       |    3.88 |\n",
-       "\n",
-       "The forecasts in the table above are the base line view from BofA Global Research. The Global Wealth & Investment Management (GWIM) Investment Strategy Committee (ISC) may make adjustments to this view over the course of the year and can express upside/downside to these forecasts. Historical data is sourced from Bloomberg, FactSet, and\n",
-       "\n",
-       "Haver Analytics. There can be no assurance that the forecasts will be achieved. Economic or financial forecasts are inherently limited and should not be relied on as indicators of future investment performance.\n",
-       "\n",
-       "A = Actual. E/* = Estimate.\n",
-       "\n",
-       "Sources: BofA Global Research; GWIM ISC as of December 13, 2024.\n",
-       "\n",
-       "## Asset Class Weightings (as of 12/3/2024)\n",
-       "\n",
-       "|                                        | CIO View                     | CIO View    | CIO View   | CIO View   | CIO View   |\n",
-       "|----------------------------------------|------------------------------|-------------|------------|------------|------------|\n",
-       "| Asset Class                            | Underweight                  | Underweight | Neutral    | Overweight | Overweight |\n",
-       "| Global Equities                        | slight over weight green    |            |           |            |           |\n",
-       "| U.S. Large Cap Growth                  |                             |            |            |           |           |\n",
-       "| U.S. Large Cap Value                   | Slight over weight green    |            |           |            |           |\n",
-       "| U.S. Small Cap Growth                  | slight over weight green    |            |           |            |           |\n",
-       "| U.S. Small Cap Value                   | slight over weight green    |            |           |            |           |\n",
-       "| International Developed                | Slight underweight orange   |             |           |           |           |\n",
-       "| Emerging Markets                       |                             |            |            |           |           |\n",
-       "| Global Fixed Income                    | slight underweight orange   |             |           |           |           |\n",
-       "| U.S. Governments                       | slight over weight green    |            |           |            |           |\n",
-       "| U.S. Mortgages                         | Slight over weight green    |            |           |            |           |\n",
-       "| U.S. Corporates                        | Slight underweight orange   |             |           |           |           |\n",
-       "| International Fixed Income             |                             |            |            |           |           |\n",
-       "| High Yield                             | Slight underweight orange   |             |           |           |           |\n",
-       "| U.S. Investment-grade                  | Neutral yellow              |            |            |           |           |\n",
-       "| Tax Exempt  U.S. High Yield Tax Exempt | Slight underweight orange   |             |           |           |           |\n",
-       "| Cash                                   |                              |             |            |            |            |\n",
-       "\n",
-       "## CIO Equity Sector Views\n",
-       "\n",
-       "|                         | CIO View                     | CIO View    | CIO View   | CIO View   | CIO View   |\n",
-       "|-------------------------|------------------------------|-------------|------------|------------|------------|\n",
-       "| Sector                  |                              | Underweight | Neutral    |            | Overweight |\n",
-       "| Utilities               | slight over weight green    |            |           |            |           |\n",
-       "| Financials              | slight over weight green    |            |           |            |           |\n",
-       "| Healthcare              | slight over weight green    |            |           |            |           |\n",
-       "| Consumer  Discretionary | Slight over weight green    |            |           |            |           |\n",
-       "| Information  Technology | Neutral yellow              |            |            |           |           |\n",
-       "| Communication  Services | Neutral yellow              |            |            |           |           |\n",
-       "| Industrials             | Neutral yellow              |            |            |           |           |\n",
-       "| Real Estate             | Neutral yellow              |            |            |           |           |\n",
-       "| Energy                  | slight underweight orange   |             |           |           |           |\n",
-       "| Materials               | slight underweight orange   |             |           |           |           |\n",
-       "| Consumer  Staples       | underweight red              |            |           |           |           |\n",
-       "\n",
-       "CIO asset class views are relative to the CIO Strategic Asset Allocation (SAA) of a multi-asset portfolio. Source: Chief Investment Office as of December 3, 2024. All sector and asset allocation recommendations must be considered in the context of an individual investor's goals, time horizon, liquidity needs and risk tolerance. Not all recommendations will be in the best interest of all investors."
-      ],
-      "text/plain": [
-       "<IPython.core.display.Markdown object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "display(Markdown(forecast_result_docling))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "{numref}`docling` shows part of the parsed result from Docling."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```{figure} ../_static/input/docling.png\n",
+    "---\n",
+    "name: docling\n",
+    "alt: Docling's result\n",
+    "scale: 60%\n",
+    "align: center\n",
+    "---\n",
+    "Docling's parsed result\n",
+    "```\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -409,85 +308,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 96,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/markdown": [
-       "Economic Forecasts (as of 12/13/2024)\n",
-       "\n",
-       "Real global GDP (% y/y annualized)\n",
-       "Real U.S. GDP (% q/q annualized)\n",
-       "CPI inflation (% y/y)\n",
-       "Core CPI inflation (% y/y)\n",
-       "Unemployment rate (%)\n",
-       "Fed funds rate, end period (%)\n",
-       "\n",
-       "Q4 2024E\n",
-       "-\n",
-       "2.0\n",
-       "2.7\n",
-       "3.3\n",
-       "4.2\n",
-       "4.38\n",
-       "\n",
-       "2024E\n",
-       "3.1\n",
-       "2.7\n",
-       "2.9\n",
-       "3.4\n",
-       "4.0\n",
-       "4.38\n",
-       "\n",
-       "Q1 2025E  Q2 2025E  Q3 2025E  Q4 2025E\n",
-       "\n",
-       "-\n",
-       "2.5\n",
-       "2.3\n",
-       "3.0\n",
-       "4.3\n",
-       "4.13\n",
-       "\n",
-       "-\n",
-       "2.3\n",
-       "2.3\n",
-       "2.9\n",
-       "4.3\n",
-       "3.88\n",
-       "\n",
-       "-\n",
-       "2.2\n",
-       "2.7\n",
-       "3.2\n",
-       "4.4\n",
-       "3.88\n",
-       "\n",
-       "-\n",
-       "2.2\n",
-       "2.5\n",
-       "3.1\n",
-       "4.4\n",
-       "3.88\n",
-       "\n",
-       "2025E\n",
-       "3.2\n",
-       "2.4\n",
-       "2.5\n",
-       "3.0\n",
-       "4.3\n",
-       "3.88\n",
-       "\n",
-       "The forecasts in the table above are the base line view f"
-      ],
-      "text/plain": [
-       "<IPython.core.display.Markdown object>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from IPython.display import display, Markdown\n",
     "display(Markdown(forecast_result_md[:500]))"
@@ -497,13 +320,30 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now, let's focus on the economic forecasts. In particular, we are interested in the CIO's 2025E forecasts.\n",
+    "{numref}`markitdown` shows part of the parsed result from MarkItDown.\n",
     "\n",
-    "```{figure} ../data/input/2025.png\n",
+    "```{figure} ../_static/input/markitdown.png\n",
+    "---\n",
+    "name: markitdown\n",
+    "alt: MarkItDown's parsed result\n",
+    "scale: 60%\n",
+    "align: center\n",
+    "---\n",
+    "MarkItDown's parsed result\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, let's focus on the economic forecasts. In particular, we are interested in extracting the CIO's 2025E forecasts.\n",
+    "\n",
+    "```{figure} ../_static/input/2025.png\n",
     "---\n",
     "name: forecast2025\n",
     "alt: Forecast 2025\n",
-    "scale: 60%\n",
+    "scale: 45%\n",
     "align: center\n",
     "---\n",
     "Forecast 2025\n",
@@ -530,7 +370,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) using the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze."
+    "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze."
    ]
   },
   {
@@ -623,7 +463,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The response is a `EconForecast` object containing a list of `Forecast` objects. We can then convert the response to a pandas DataFrame for easier comparison."
+    "The response is an `EconForecast` object containing a list of `Forecast` objects, as defined in the pydantic model. We can then convert the response to a pandas DataFrame for easier comparison."
    ]
   },
   {
@@ -830,19 +670,19 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The results from both MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches successfully extracted the economic forecast data with equal precision. The formatting differences between the two methods did not impact their ability to capture and structure the underlying information at least in this particular case."
+    "The results from MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches enabled the LLM to successfully extract the economic forecast data with equal accuracy, in this particular case."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view is represented in a spectrum from \"Underweight\", passing through \"Neutral\" to \"Overweight\". And the actual view is marked by some colored dots. Let's see if we can extract the information from the document.\n",
-    "```{figure} ../data/input/asset_class.png\n",
+    "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n",
+    "```{figure} ../_static/input/asset_class.png\n",
     "---\n",
     "name: asset_class\n",
     "alt: Asset Class Weightings\n",
-    "scale: 60%\n",
+    "scale: 50%\n",
     "align: center\n",
     "---\n",
     "Asset Class Weightings\n",
@@ -884,7 +724,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document."
+    "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart."
    ]
   },
   {
@@ -1091,7 +931,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output."
+    "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates."
    ]
   },
   {
@@ -1100,8 +940,8 @@
    "source": [
     "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n",
     "\n",
-    "We observe that Docling extracted 7 tables from the document. Exporting tables from top down left to right in order of appearance.\n",
-    "We can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts. We also display the last table, which contains CIO Equity Sector Views.\n"
+    "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n",
+    "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n"
    ]
   },
   {
@@ -1663,7 +1503,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model."
+    "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model to its constructor."
    ]
   },
   {
@@ -1688,13 +1528,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Here's the description we obtain from the image of our input document. Overall, the description is somewhat accurate but contains a few inaccuracies including:\n",
-    "\n",
-    "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n",
-    "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n",
-    "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n",
-    "\n",
-    "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case.\n"
+    "Here's the description we obtain from the image of our input document."
    ]
   },
   {
@@ -1742,15 +1576,30 @@
     "display(Markdown(result.text_content))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "Overall, the description is somewhat accurate but contains a few inaccuracies including:\n",
+    "\n",
+    "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n",
+    "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n",
+    "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n",
+    "\n",
+    "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "## Retrieval-Augmented Generation\n",
     "\n",
-    "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks.\n",
+    "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n",
     "\n",
-    "{cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`"
+    "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`."
    ]
   },
   {
@@ -1774,12 +1623,12 @@
    "source": [
     "### Case Study I: Content Chunking with Contextual Linking\n",
     "\n",
-    "Content chunking with contextual linking is a technique used to manage the `max_output_tokens` limitation by breaking down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n",
+    "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n",
     "1. The LLM's inability to process long inputs to do context-size limits\n",
     "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n",
     "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n",
     "\n",
-    "The following steps are followed to implement content chunking with contextual linking:\n",
+    "Here, we exemplify this technique by following these steps:\n",
     "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n",
     "\n",
     "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n",
@@ -2105,7 +1954,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 109,
    "metadata": {},
    "outputs": [
     {
@@ -2131,72 +1980,10 @@
        "\n",
        "**2.2 Operational Risks:**\n",
        "\n",
-       "Several operational risks could significantly impact Apple's performance:\n",
-       "\n",
-       "* **Employee Retention:**  Competition for highly skilled employees, particularly in Silicon Valley, poses a significant risk.  Failure to retain key personnel or maintain its distinctive culture could negatively affect innovation, product development, and overall operational efficiency.  **Market-moving insight:**  Any significant changes in employee turnover rates or negative press regarding Apple's workplace culture could negatively impact investor sentiment.\n",
-       "\n",
-       "* **Reseller Dependence:** Apple's reliance on carriers, wholesalers, and retailers for product distribution introduces risks related to their financial health, distribution decisions, and potential changes in financing or subsidy programs.  **Market-moving insight:**  Monitoring the financial performance of key resellers and any changes in their distribution strategies is crucial.\n",
-       "\n",
-       "* **Information Technology and Cybersecurity:**  Apple's dependence on complex IT systems makes it vulnerable to system failures, network disruptions, and cybersecurity threats (including ransomware attacks).  These events could disrupt operations, damage reputation, and impact sales.  The Form 10-K highlights the company's proactive measures, but acknowledges that these may not be sufficient to prevent all incidents.  **Market-moving insight:**  Any major cybersecurity breach or significant service outage could trigger a negative market reaction.\n",
-       "\n",
-       "**2.3 Legal and Regulatory Risks:**\n",
-       "\n",
-       "Apple faces significant legal and regulatory challenges:\n",
-       "\n",
-       "* **Antitrust Litigation:**  The ongoing antitrust lawsuits in the U.S. and investigations in Europe concerning App Store practices pose a substantial risk.  Adverse outcomes could result in significant fines, changes to business practices, and reputational damage.  **Market-moving insight:**  The progress and outcomes of these legal proceedings will be closely watched by the market.  Any negative developments could significantly impact Apple's stock price.\n",
-       "\n",
-       "* **Digital Markets Act (DMA) Compliance:**  Apple's efforts to comply with the DMA in the EU introduce uncertainty and potential costs.  Non-compliance could lead to substantial fines.  **Market-moving insight:**  The Commission's ongoing investigations and any subsequent decisions will be closely monitored.\n",
-       "\n",
-       "* **Data Privacy and Protection:**  Increasingly stringent data privacy regulations worldwide impose significant compliance costs and risks.  Non-compliance could result in penalties and reputational harm.  **Market-moving insight:**  Any significant fines or negative publicity related to data privacy violations could negatively impact Apple's stock price.\n",
-       "\n",
-       "* **Other Legal Proceedings:**  The Form 10-K notes that Apple is subject to various other legal proceedings, the outcomes of which are uncertain and could materially affect its financial condition.\n",
-       "\n",
-       "**2.4 Financial Risks:**\n",
-       "\n",
-       "Several financial risks could impact Apple's performance:\n",
-       "\n",
-       "* **Sales and Profit Margin Volatility:**  Apple's quarterly net sales and profit margins are subject to fluctuations due to various factors, including pricing pressures, competition, product life cycles, supply chain issues, and macroeconomic conditions.  **Market-moving insight:**  Any significant deviation from expected sales or profit margins could trigger market reactions.\n",
-       "\n",
-       "* **Foreign Exchange Rate Fluctuations:**  Apple's international operations expose it to risks associated with changes in the value of the U.S. dollar.  Fluctuations in exchange rates can impact sales, earnings, and gross margins.  **Market-moving insight:**  Significant movements in major currency exchange rates relative to the USD should be monitored for their potential impact on Apple's financial results.\n",
-       "\n",
-       "* **Credit Risk and Investment Portfolio:**  Apple's exposure to credit risk on trade receivables and fluctuations in the value of its investment portfolio could lead to losses.  **Market-moving insight:**  Any significant deterioration in the creditworthiness of key customers or a substantial decline in the value of Apple's investment portfolio could be viewed negatively by the market.\n",
-       "\n",
-       "* **Tax Risks:**  Changes in tax rates, new tax legislation, and tax audits could materially affect Apple's financial performance.\n",
        "\n",
        " (...) \n",
        "\n",
-       " **4.10 Debt and Share Repurchases:**\n",
-       "\n",
-       "Note 9 details Apple's debt structure, including commercial paper and term debt.  While the company has a strong credit rating, the significant amount of debt and the high weighted-average interest rate on commercial paper (5.00% in 2024) indicate potential interest rate risk.  Note 10 highlights the substantial share repurchase program ($95 billion in 2024), which, while returning value to shareholders, could limit funds available for future investments or acquisitions.  **Market-moving insight:**  Investors will monitor the balance between debt levels, share repurchases, and investments in future growth.\n",
-       "\n",
-       "**4.11 Share-Based Compensation:**\n",
-       "\n",
-       "Note 11 shows a steady increase in share-based compensation expense, reflecting Apple's reliance on equity-based incentives to attract and retain talent.  The significant unrecognized compensation cost related to outstanding RSUs ($19.4 billion in 2024) represents a future expense commitment.  **Market-moving insight:**  Changes in share-based compensation policies or unexpected increases in expense could impact future profitability.\n",
-       "\n",
-       "**4.12 Commitments and Supply Concentrations:**\n",
-       "\n",
-       "Note 12 reveals Apple's substantial unconditional purchase obligations, primarily for suppliers, licensed intellectual property, and content.  These commitments represent significant future cash outflows and highlight the company's dependence on its supply chain.  **Market-moving insight:**  Any disruptions in the supply chain or changes in supplier relationships could negatively impact Apple's production and sales.\n",
-       "\n",
-       "\n",
-       "This detailed analysis reveals several key risk factors and market-moving insights beyond those identified in Part 3.  Investors and analysts should carefully consider these factors when assessing Apple's future performance and valuation.\n",
-       "\n",
-       "**PART 5: Contingencies, Supply Chain, and Segment Analysis**\n",
-       "\n",
-       "This section analyzes additional information from Apple Inc.'s 2024 Form 10-K, focusing on contingencies, supply chain risks, and a deeper dive into segment performance.\n",
-       "\n",
-       "**5.1 Contingencies and Legal Proceedings:**\n",
-       "\n",
-       "The Form 10-K acknowledges that Apple is involved in various legal proceedings and claims. While management believes no material loss is reasonably possible beyond existing accruals, the inherent uncertainty of litigation remains a risk.  Adverse outcomes in any of these cases could negatively impact Apple's financial condition and reputation.  **Market-moving insight:**  Any significant legal developments or settlements should be closely monitored for their potential market impact.  Increased legal expenses or negative publicity could affect investor sentiment.\n",
-       "\n",
-       "**5.2 Supply Chain Concentration:**\n",
-       "\n",
-       "Apple's reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, presents significant risks.  The dependence on single or limited sources for certain custom components exposes Apple to supply chain disruptions, shortages, and price fluctuations.  While Apple uses multiple sources for most components, the unique nature of some components used in new products creates vulnerability.  Suppliers might prioritize common components over custom ones, impacting Apple's ability to produce its innovative products.  **Market-moving insight:**  Any significant supply chain disruptions, geopolitical instability in key manufacturing regions, or changes in supplier relationships could negatively impact Apple's production and sales, triggering a negative market reaction.\n",
-       "\n",
-       "**5.3 Detailed Segment Analysis:**\n",
-       "\n",
-       "Note 13 provides a detailed breakdown of Apple's segment performance.  While the Americas and Europe showed growth, primarily driven by Services revenue, Greater China experienced a decline due to lower iPhone and iPad sales and currency headwinds.  This highlights the regional economic and currency risks impacting Apple's revenue.  The relatively flat year-over-year iPhone sales, despite growth in other product lines, warrants further investigation into market saturation and competitive pressures.  The significant contribution of the Services segment to overall revenue and profitability underscores both its importance and the risk associated with its dependence on this segment.\n",
-       "\n",
-       "The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability.  While increased R&D is generally positive, it reduces short-term profits.  The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China.  **Market-moving insight:**  Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n",
+       " The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability.  While increased R&D is generally positive, it reduces short-term profits.  The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China.  **Market-moving insight:**  Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n",
        "\n",
        "\n",
        "**5.4 Auditor's Report and Internal Controls:**\n",
@@ -2232,10 +2019,10 @@
     "    \n",
     "from IPython.display import Markdown\n",
     "\n",
-    "# Display first and last 25% of the report content\n",
+    "# Display first and last 10% of the report content\n",
     "report_lines = report_content.splitlines()\n",
     "total_lines = len(report_lines)\n",
-    "quarter_lines = total_lines // 4\n",
+    "quarter_lines = total_lines // 10\n",
     "\n",
     "top_portion = '\\n'.join(report_lines[:quarter_lines])\n",
     "bottom_portion = '\\n'.join(report_lines[-quarter_lines:])\n",
@@ -2266,7 +2053,18 @@
     "\n",
     "- **Depth of Analysis**: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert's evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.\n",
     "\n",
-    "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n"
+    "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n",
+    "\n",
+    "Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic's Contextual Retrieval {cite}`anthropic2024contextualretrieval`. The approach, as shown in {numref}`anth_contextual`, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.\n",
+    "```{figure} ../_static/input/anth_contextual.png\n",
+    "---\n",
+    "name: anth_contextual\n",
+    "alt: Anthropic Contextual Linking\n",
+    "scale: 50%\n",
+    "align: center\n",
+    "---\n",
+    "Anthropic Contextual Linking {cite}`anthropic2024contextualretrieval`.\n",
+    "```"
    ]
   },
   {
@@ -2336,7 +2134,7 @@
     "\n",
     "**Corpus-in-Context Prompting**\n",
     "\n",
-    "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor, which we used MarkitDown. The content is then added to the conversation input in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n",
+    "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n",
     "\n",
     "{numref}`cic` shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.\n",
     "\n",
@@ -2376,7 +2174,7 @@
     "```\n",
     "\n",
     "\n",
-    "Later, when the user sends a message to the client, the `msg()` method is used to generate a response  while enabling citations. `self.content_generator` is an instance of our LLM model, which we will next.\n",
+    "Later, when the user sends a message to the client, the `msg()` method is used to generate a response  while enabling citations. `self.content_generator` is an instance of our LLM model, which we will go through next.\n",
     "\n",
     "```python\n",
     "    def msg(self, msg: str = \"\", add_citations: bool = False) -> str:\n",
@@ -2397,9 +2195,7 @@
     "\n",
     "LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.\n",
     "\n",
-    "Context caching proves especially valuable when a large initial context needs to be referenced multiple times by smaller requests. By caching the context upfront, these applications can maintain high performance while optimizing token usage and associated costs.\n",
-    "\n",
-    "In our application, the user might pass a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` - containing the entire knowledge base and any additional user added content.\n",
+    "In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` containing the entire knowledge base and any additional user added content.\n",
     "```python\n",
     "self.llm = LLMBackend(input=self.input_memory)\n",
     "```\n",
@@ -2425,7 +2221,7 @@
     "\n",
     "Coming back to our `Client` class, we implement the `quiz()` method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.\n",
     "\n",
-    "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke the `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n",
+    "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke its `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n",
     "\n",
     "```python\n",
     "    def quiz(self, add_citations: bool = True, num_questions: int = 10) -> str:\n",
@@ -2449,7 +2245,8 @@
     "> - Answers should be provided at the end of the quiz.\n",
     "> - Questions should have broad coverage of the input including multiple Input IDs.\n",
     "> - Level of difficulty is advanced/hard.\n",
-    "> - {citations}\n",
+    "> - {{citations}}\n",
+    ">\n",
     "> STRUCTURE:\n",
     "> - Sequence of questions and alternatives.\n",
     "> - At the end provide the correct answers.\n",
@@ -2497,7 +2294,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We will import our module as `genai_duo` and initialize the `Client` class with our knowledge base."
+    "We will import our module `gemini_duo` as `genai_duo` and initialize the `Client` class with our knowledge base."
    ]
   },
   {
@@ -2562,7 +2359,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "{numref}`quiz` shows a sample sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n",
+    "{numref}`quiz` shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n",
     "\n",
     "```{figure} ../_static/input/quiz.png\n",
     "---\n",
@@ -2581,24 +2378,17 @@
    "source": [
     "#### Discussion\n",
     "\n",
-    "The experiment demonstrated the ability to build a knowledge base from multiple sources and generate quizzes with citations. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n",
+    "The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n",
     "\n",
     "However, several limitations emerged during this process:\n",
     "\n",
     "1. Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.\n",
     "\n",
-    "2. Context Window Constraints: With 38,470 tokens cached, we are approaching typical context window limits of many LLMs. This restricts how much knowledge can be referenced simultaneously during generation.\n",
-    "\n",
-    "3. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n",
-    "\n",
-    "4. Content Verification: The system does not currently verify the accuracy of generated quiz questions against the source material. This could lead to potential hallucinations or misinterpretations.\n",
-    "\n",
-    "5. Input Format Limitations: The current implementation works well with plain text but may struggle with more complex document formats or structured data sources.\n",
-    "\n",
-    "These limitations highlight opportunities for future improvements in knowledge management and citation systems when building LLM-powered educational tools.\n",
+    "2. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n",
     "\n",
+    "3. Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.\n",
     "\n",
-    "Citation Granularity: While citations are provided, currently they are given at the resource level rather than specific passages."
+    "While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.\n"
    ]
   },
   {
diff --git a/tamingllms/notebooks/local.ipynb b/tamingllms/notebooks/local.ipynb
index fa1f01e..7a717ce 100644
--- a/tamingllms/notebooks/local.ipynb
+++ b/tamingllms/notebooks/local.ipynb
@@ -181,11 +181,11 @@
     "Performance Comparison including proprietary models.\n",
     "```\n",
     "\n",
-    "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n",
+    "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive cost efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n",
     "\n",
-    "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.\n",
+    "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.\n",
     "\n",
-    "```{figure} ../_static/local/deep.png\n",
+    "```{figure} ../_static/local/deep.jpeg\n",
     "---\n",
     "name: deep\n",
     "alt: DeepSeek-V3\n",
@@ -195,7 +195,7 @@
     "DeepSeek-V3 Performance Comparison\n",
     "```\n",
     "\n",
-    "```{figure} ../_static/local/deep2.png\n",
+    "```{figure} ../_static/local/deep2.jpeg\n",
     "---\n",
     "name: deep2\n",
     "alt: DeepSeek-V3 Cost Benefit Analysis\n",
diff --git a/tamingllms/references.bib b/tamingllms/references.bib
index 1c7b988..bd754d5 100644
--- a/tamingllms/references.bib
+++ b/tamingllms/references.bib
@@ -782,7 +782,7 @@ @misc{lewis2021retrievalaugmentedgenerationknowledgeintensivenlp
 
 @misc{deepseek2024v3,
     title={DeepSeek-V3 Technical Report},
-    author={DeepSeek AI},
+    author={DeepSeek},
     year={2024},
     howpublished={Technical Report},
     url={https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf}
@@ -1186,9 +1186,108 @@ @techreport{ukgov2024airegulation24
       url={https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper},
 }
 
+
+@inproceedings{10.1145/3589334.3645481,
+author = {Zhou, Yujia and Liu, Zheng and Jin, Jiajie and Nie, Jian-Yun and Dou, Zhicheng},
+title = {Metacognitive Retrieval-Augmented Large Language Models},
+year = {2024},
+isbn = {9798400701719},
+publisher = {Association for Computing Machinery},
+address = {New York, NY, USA},
+url = {https://doi.org/10.1145/3589334.3645481},
+doi = {10.1145/3589334.3645481},
+booktitle = {Proceedings of the ACM Web Conference 2024},
+pages = {1453-1463},
+numpages = {11},
+keywords = {llms, metacognition, retrieval-augmented generation},
+location = {Singapore, Singapore},
+series = {WWW '24}
+}
+
+@misc{tan2024htmlraghtmlbetterplain,
+      title={HtmlRAG: HTML is Better Than Plain Text for Modeling Retrieved Knowledge in RAG Systems}, 
+      author={Jiejun Tan and Zhicheng Dou and Wen Wang and Mang Wang and Weipeng Chen and Ji-Rong Wen},
+      year={2024},
+      eprint={2411.02959},
+      archivePrefix={arXiv},
+      primaryClass={cs.IR},
+      url={https://arxiv.org/abs/2411.02959}, 
+}
+
+@misc{anthropic2024contextualretrieval,
+      title={Introducing Contextual Retrieval}, 
+      author={{Anthropic}},
+      year={2024},
+      month={09},
+      url={https://www.anthropic.com/news/contextual-retrieval}
+}
+
+
+@article{zhou2024larger,
+author = {Zhou, Lexin and Schellaert, Wout and Plumed, Fernando and Moros-Daval, Yael and Ferri, Cesar and Hernández-Orallo, Jose},
+year = {2024},
+month = {09},
+pages = {61-68},
+title = {Larger and more instructable language models become less reliable},
+volume = {634},
+journal = {Nature},
+doi = {10.1038/s41586-024-07930-y}
+}
+
+@inproceedings{amayuelas-etal-2024-knowledge,
+    title = "Knowledge of Knowledge: Exploring Known-Unknowns Uncertainty with Large Language Models",
+    author = "Amayuelas, Alfonso  and
+      Wong, Kyle  and
+      Pan, Liangming  and
+      Chen, Wenhu  and
+      Wang, William Yang",
+    editor = "Ku, Lun-Wei  and
+      Martins, Andre  and
+      Srikumar, Vivek",
+    booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
+    month = aug,
+    year = "2024",
+    address = "Bangkok, Thailand",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2024.findings-acl.383",
+    doi = "10.18653/v1/2024.findings-acl.383",
+    pages = "6416--6432",
+
+}
+
+@inproceedings{
+kotha2024understanding,
+title={Understanding Catastrophic Forgetting in Language Models via Implicit Inference},
+author={Suhas Kotha and Jacob Mitchell Springer and Aditi Raghunathan},
+booktitle={The Twelfth International Conference on Learning Representations},
+year={2024},
+url={https://openreview.net/forum?id=VrHiF2hsrm}
+}
+
+
+
+@inproceedings{ni-etal-2024-llms,
+    title = "When Do {LLM}s Need Retrieval Augmentation? Mitigating {LLM}s{'} Overconfidence Helps Retrieval Augmentation",
+    author = "Ni, Shiyu  and
+      Bi, Keping  and
+      Guo, Jiafeng  and
+      Cheng, Xueqi",
+    editor = "Ku, Lun-Wei  and
+      Martins, Andre  and
+      Srikumar, Vivek",
+    booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
+    month = aug,
+    year = "2024",
+    address = "Bangkok, Thailand",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2024.findings-acl.675",
+    doi = "10.18653/v1/2024.findings-acl.675",
+    pages = "11375--11388",
+}
+
 @misc{meta2024llamaguard,
       title={LlamaGuard: LLM-based Input-Output Safeguard for Human-AI Conversations}, 
-      author={Meta AI},
+      author={Meta-AI},
       year={2024},
       howpublished={Meta AI Research Publications},
       url={https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/},