diff --git a/tamingllms/_build/.doctrees/environment.pickle b/tamingllms/_build/.doctrees/environment.pickle index 19c7d07..2659c72 100644 Binary files a/tamingllms/_build/.doctrees/environment.pickle and b/tamingllms/_build/.doctrees/environment.pickle differ diff --git a/tamingllms/_build/.doctrees/markdown/preface.doctree b/tamingllms/_build/.doctrees/markdown/preface.doctree index b0f353c..7359998 100644 Binary files a/tamingllms/_build/.doctrees/markdown/preface.doctree and b/tamingllms/_build/.doctrees/markdown/preface.doctree differ diff --git a/tamingllms/_build/.doctrees/markdown/toc.doctree b/tamingllms/_build/.doctrees/markdown/toc.doctree index 50ce0d7..426fa06 100644 Binary files a/tamingllms/_build/.doctrees/markdown/toc.doctree and b/tamingllms/_build/.doctrees/markdown/toc.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/alignment.doctree b/tamingllms/_build/.doctrees/notebooks/alignment.doctree index 4c0ba1b..c5826fd 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/alignment.doctree and b/tamingllms/_build/.doctrees/notebooks/alignment.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/cost.doctree b/tamingllms/_build/.doctrees/notebooks/cost.doctree index 6c13c11..ff51b9b 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/cost.doctree and b/tamingllms/_build/.doctrees/notebooks/cost.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/evals.doctree b/tamingllms/_build/.doctrees/notebooks/evals.doctree index c3de329..c036c74 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/evals.doctree and b/tamingllms/_build/.doctrees/notebooks/evals.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/input.doctree b/tamingllms/_build/.doctrees/notebooks/input.doctree new file mode 100644 index 0000000..3493903 Binary files /dev/null and b/tamingllms/_build/.doctrees/notebooks/input.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/local.doctree b/tamingllms/_build/.doctrees/notebooks/local.doctree index be91ddb..54b5b67 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/local.doctree and b/tamingllms/_build/.doctrees/notebooks/local.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/safety.doctree b/tamingllms/_build/.doctrees/notebooks/safety.doctree index a49b2c3..c06a8bd 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/safety.doctree and b/tamingllms/_build/.doctrees/notebooks/safety.doctree differ diff --git a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree index 526478c..d0960da 100644 Binary files a/tamingllms/_build/.doctrees/notebooks/structured_output.doctree and b/tamingllms/_build/.doctrees/notebooks/structured_output.doctree differ diff --git a/tamingllms/_build/html/_images/2025.png b/tamingllms/_build/html/_images/2025.png new file mode 100644 index 0000000..e5e7914 Binary files /dev/null and b/tamingllms/_build/html/_images/2025.png differ diff --git a/tamingllms/_build/html/_images/anth_contextual.png b/tamingllms/_build/html/_images/anth_contextual.png new file mode 100644 index 0000000..c8401c0 Binary files /dev/null and b/tamingllms/_build/html/_images/anth_contextual.png differ diff --git a/tamingllms/data/input/asset_class.png b/tamingllms/_build/html/_images/asset_class.png similarity index 100% rename from tamingllms/data/input/asset_class.png rename to tamingllms/_build/html/_images/asset_class.png diff --git a/tamingllms/_build/html/_images/cic.png b/tamingllms/_build/html/_images/cic.png new file mode 100644 index 0000000..5b180e5 Binary files /dev/null and b/tamingllms/_build/html/_images/cic.png differ diff --git a/tamingllms/_build/html/_images/deep.jpeg b/tamingllms/_build/html/_images/deep.jpeg new file mode 100644 index 0000000..342a13b Binary files /dev/null and b/tamingllms/_build/html/_images/deep.jpeg differ diff --git a/tamingllms/_build/html/_images/deep2.jpeg b/tamingllms/_build/html/_images/deep2.jpeg new file mode 100644 index 0000000..c370001 Binary files /dev/null and b/tamingllms/_build/html/_images/deep2.jpeg differ diff --git a/tamingllms/_build/html/_images/diagram1.png b/tamingllms/_build/html/_images/diagram1.png new file mode 100644 index 0000000..1470769 Binary files /dev/null and b/tamingllms/_build/html/_images/diagram1.png differ diff --git a/tamingllms/_build/html/_images/docling.png b/tamingllms/_build/html/_images/docling.png new file mode 100644 index 0000000..143ded9 Binary files /dev/null and b/tamingllms/_build/html/_images/docling.png differ diff --git a/tamingllms/_build/html/_images/forecast.png b/tamingllms/_build/html/_images/forecast.png new file mode 100644 index 0000000..905776c Binary files /dev/null and b/tamingllms/_build/html/_images/forecast.png differ diff --git a/tamingllms/_build/html/_images/harvard.png b/tamingllms/_build/html/_images/harvard.png new file mode 100644 index 0000000..0b60f7d Binary files /dev/null and b/tamingllms/_build/html/_images/harvard.png differ diff --git a/tamingllms/_build/html/_images/markitdown.png b/tamingllms/_build/html/_images/markitdown.png new file mode 100644 index 0000000..282503c Binary files /dev/null and b/tamingllms/_build/html/_images/markitdown.png differ diff --git a/tamingllms/_build/html/_images/quiz.png b/tamingllms/_build/html/_images/quiz.png new file mode 100644 index 0000000..a627f7c Binary files /dev/null and b/tamingllms/_build/html/_images/quiz.png differ diff --git a/tamingllms/_build/html/_sources/notebooks/input.ipynb b/tamingllms/_build/html/_sources/notebooks/input.ipynb new file mode 100644 index 0000000..d05669d --- /dev/null +++ b/tamingllms/_build/html/_sources/notebooks/input.ipynb @@ -0,0 +1,2454 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(input)=\n", + "# Managing Input Data\n", + "```{epigraph}\n", + "One home run is much better than two doubles.\n", + "\n", + "-- Steve Jobs\n", + "```\n", + "```{contents}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n", + "\n", + "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", + "\n", + "Motivated by these challenges, this chapter explores two key components:\n", + "\n", + "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n", + "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n", + "\n", + "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n", + "\n", + "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n", + "\n", + "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n", + "\n", + "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Parsing Documents\n", + "\n", + "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n", + "\n", + "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MarkItDown\n", + "\n", + "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", + "\n", + "Key features:\n", + "- Simple command-line and Python API interfaces\n", + "- Support for multiple file formats\n", + "- Optional LLM integration for enhanced image descriptions\n", + "- Batch processing capabilities\n", + "- Docker support for containerized usage\n", + "\n", + "Sample usage:\n", + "```python\n", + "from markitdown import MarkItDown\n", + "\n", + "md = MarkItDown()\n", + "result = md.convert(\"test.xlsx\")\n", + "print(result.text_content)\n", + "```\n", + "\n", + "### Docling\n", + "\n", + "Docling is a Python package developed by IBM Research for parsing and converting documents into various formats. It provides advanced document understanding capabilities with a focus on maintaining document structure and formatting.\n", + "\n", + "Key features:\n", + "- Support for multiple document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, etc.)\n", + "- Advanced PDF parsing including layout analysis and table extraction\n", + "- Unified document representation format\n", + "- Integration with LlamaIndex and LangChain\n", + "- OCR support for scanned documents\n", + "- Simple CLI interface\n", + "\n", + "Sample usage:\n", + "```python\n", + "from docling.document_converter import DocumentConverter\n", + "\n", + "converter = DocumentConverter()\n", + "result = converter.convert(\"document.pdf\")\n", + "print(result.document.export_to_markdown())\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Frameworks-Based Parsing\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structured Data Extraction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n", + "\n", + "\n", + "```{figure} ../data/input/forecast.png\n", + "---\n", + "name: forecast\n", + "alt: Forecast\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Forecast\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "FORECAST_FILE_PATH = \"../data/input/forecast.pdf\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we will use MarkItDown to extract the text content from the document." + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "from markitdown import MarkItDown\n", + "\n", + "md = MarkItDown()\n", + "result_md = md.convert(FORECAST_FILE_PATH).text_content" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will do the same with Docling." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.document_converter import DocumentConverter\n", + "\n", + "converter = DocumentConverter()\n", + "forecast_result_docling = converter.convert(source).document.export_to_markdown()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How similar are the two results? We can use use Levenshtein distance to measure the similarity between the two results. We will also calculate a naive score using the `SequenceMatcher` from the `difflib` package, which is a simple measure of the similarity between two strings based on the number of matches in the longest common subsequence." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "import Levenshtein\n", + "def levenshtein_similarity(text1: str, text2: str) -> float:\n", + " \"\"\"\n", + " Calculate normalized Levenshtein distance\n", + " Returns value between 0 (completely different) and 1 (identical)\n", + " \"\"\"\n", + " distance = Levenshtein.distance(text1, text2)\n", + " max_len = max(len(text1), len(text2))\n", + " return 1 - (distance / max_len)\n", + "\n", + "from difflib import SequenceMatcher\n", + "def simple_similarity(text1: str, text2: str) -> float:\n", + " \"\"\"\n", + " Calculate similarity ratio using SequenceMatcher\n", + " Returns value between 0 (completely different) and 1 (identical)\n", + " \"\"\"\n", + " return SequenceMatcher(None, text1, text2).ratio()" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.13985705461925346" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "levenshtein_similarity(forecast_result_md, forecast_result_docling)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.17779960707269155" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "simple_similarity(forecast_result_md, forecast_result_docling)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It turns out that the two results are quite different, with a similarity score of about 13.98% and 17.77% for Levenshtein and `SequenceMatcher` respectively." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Docling's result is a quite readable markdown displaying key economic variables and their forecasts. Conversely, MarkItDown's result is a bit messy and hard to read but the information is there just not in a structured format. Does it matter? That's what we will explore next." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Docling's result**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(Markdown(forecast_result_docling))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "{numref}`docling` shows part of the parsed result from Docling." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{figure} ../_static/input/docling.png\n", + "---\n", + "name: docling\n", + "alt: Docling's result\n", + "scale: 60%\n", + "align: center\n", + "---\n", + "Docling's parsed result\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**MarkItDown's result**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown\n", + "display(Markdown(forecast_result_md[:500]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "{numref}`markitdown` shows part of the parsed result from MarkItDown.\n", + "\n", + "```{figure} ../_static/input/markitdown.png\n", + "---\n", + "name: markitdown\n", + "alt: MarkItDown's parsed result\n", + "scale: 60%\n", + "align: center\n", + "---\n", + "MarkItDown's parsed result\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's focus on the economic forecasts. In particular, we are interested in extracting the CIO's 2025E forecasts.\n", + "\n", + "```{figure} ../_static/input/2025.png\n", + "---\n", + "name: forecast2025\n", + "alt: Forecast 2025\n", + "scale: 45%\n", + "align: center\n", + "---\n", + "Forecast 2025\n", + "```\n", + "\n", + "We will define a `Forecast` pydantic model to represent an economic forecast composed of a `financial_variable` and a `financial_forecast`. We will also define a `EconForecast` pydantic model to represent the list of economic forecasts we want to extract from the document.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import BaseModel\n", + "class Forecast(BaseModel):\n", + " financial_variable: str\n", + " financial_forecast: float\n", + "class EconForecast(BaseModel):\n", + " forecasts: list[Forecast]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "BASE_PROMPT = f\"\"\"\n", + " ROLE: You are an expert at structured data extraction. \n", + " TASK: Extract the following data {extract_prompt} from input DOCUMENT\n", + " FORMAT: The output should be a JSON object with 'financial_variable' as key and 'financial_forecast' as value.\n", + " \"\"\"\n", + "prompt = f\"{BASE_PROMPT} \\n\\n DOCUMENT: {doc}\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_from_doc(extract_prompt: str, doc: str, client) -> EconForecast:\n", + " \"\"\"\n", + " Extract data of a financial document using an LLM model.\n", + " \n", + " Args:\n", + " doc: The financial document text to analyze\n", + " client: The LLM model to use for analysis\n", + " extract_prompt: The prompt to use for extraction\n", + " \n", + " Returns:\n", + " EconForecasts object containing sentiment analysis results\n", + " \"\"\"\n", + "\n", + " BASE_PROMPT = f\"\"\"\n", + " ROLE: You are an expert at structured data extraction. \n", + " TASK: Extract the following data {extract_prompt} from input DOCUMENT\n", + " FORMAT: The output should be a JSON object with 'financial_variable' as key and 'financial_forecast' as value.\n", + " \"\"\"\n", + " prompt = f\"{BASE_PROMPT} \\n\\n DOCUMENT: {doc}\"\n", + " completion = client.beta.chat.completions.parse(\n", + " model=\"gpt-4o-mini\",\n", + " messages=[\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": prompt\n", + " },\n", + " {\"role\": \"user\", \"content\": doc}\n", + " ],\n", + " response_format=EconForecast\n", + " )\n", + " return completion.choices[0].message.parsed" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "import os\n", + "\n", + "# Load environment variables from .env file\n", + "load_dotenv(override=True)\n", + "from openai import OpenAI\n", + "client = OpenAI()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The user then calls the `extract_from_doc` function simply defining that \"Economic Forecasts for 2025E\" is the data they would like to extract from the document. We perform the extraction twice, once with MarkItDown and once with Docling." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "extract_prompt = \"Economic Forecasts for 2025E\"\n", + "md_financials = extract_from_doc(extract_prompt, forecast_result_md, client)\n", + "docling_financials = extract_from_doc(extract_prompt, forecast_result_docling, client)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The response is an `EconForecast` object containing a list of `Forecast` objects, as defined in the pydantic model. We can then convert the response to a pandas DataFrame for easier comparison." + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EconForecast(forecasts=[Forecast(financial_variable='Real global GDP (% y/y annualized)', financial_forecast=3.2), Forecast(financial_variable='Real U.S. GDP (% q/q annualized)', financial_forecast=2.4), Forecast(financial_variable='CPI inflation (% y/y)', financial_forecast=2.5), Forecast(financial_variable='Core CPI inflation (% y/y)', financial_forecast=3.0), Forecast(financial_variable='Unemployment rate (%)', financial_forecast=4.3), Forecast(financial_variable='Fed funds rate, end period (%)', financial_forecast=3.88)])" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_financials" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_md_forecasts = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in md_financials.forecasts], \n", + " columns=['Variable', 'Forecast'])\n", + "df_docling_forecasts = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in docling_financials.forecasts], \n", + " columns=['Variable', 'Forecast'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VariableForecast
0Real global GDP (% y/y annualized)3.20
1Real U.S. GDP (% q/q annualized)2.40
2CPI inflation (% y/y)2.50
3Core CPI inflation (% y/y)3.00
4Unemployment rate (%)4.30
5Fed funds rate, end period (%)3.88
\n", + "
" + ], + "text/plain": [ + " Variable Forecast\n", + "0 Real global GDP (% y/y annualized) 3.20\n", + "1 Real U.S. GDP (% q/q annualized) 2.40\n", + "2 CPI inflation (% y/y) 2.50\n", + "3 Core CPI inflation (% y/y) 3.00\n", + "4 Unemployment rate (%) 4.30\n", + "5 Fed funds rate, end period (%) 3.88" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_md_forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VariableForecast
0Real global GDP (% y/y annualized)3.20
1Real U.S. GDP (% q/q annualized)2.40
2CPI inflation (% y/y)2.50
3Core CPI inflation (% y/y)3.00
4Unemployment rate (%)4.30
5Fed funds rate, end period (%)3.88
\n", + "
" + ], + "text/plain": [ + " Variable Forecast\n", + "0 Real global GDP (% y/y annualized) 3.20\n", + "1 Real U.S. GDP (% q/q annualized) 2.40\n", + "2 CPI inflation (% y/y) 2.50\n", + "3 Core CPI inflation (% y/y) 3.00\n", + "4 Unemployment rate (%) 4.30\n", + "5 Fed funds rate, end period (%) 3.88" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_docling_forecasts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results from MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches enabled the LLM to successfully extract the economic forecast data with equal accuracy, in this particular case." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n", + "```{figure} ../_static/input/asset_class.png\n", + "---\n", + "name: asset_class\n", + "alt: Asset Class Weightings\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Asset Class Weightings\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The user will simply define the following data to extract: \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\". In that way, we expect that \"Underweight\" will be mapped to -2, \"Neutral\" to 0 and \"Overweight\" to 2 with some values in between." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "extract_prompt = \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\"\n", + "asset_class_docling = extract_from_doc(extract_prompt, forecast_result_docling, client)\n", + "asset_class_md = extract_from_doc(extract_prompt, forecast_result_md, client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "df_md = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_md.forecasts], \n", + " columns=['Variable', 'Forecast'])\n", + "df_docling = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_docling.forecasts], \n", + " columns=['Variable', 'Forecast'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
variablemarkitdowndoclingtrue_value
0Global Equities1.01.01.0
1U.S. Large Cap Growth1.01.00.0
2U.S. Large Cap Value1.01.01.0
3U.S. Small Cap Growth1.01.01.0
4U.S. Small Cap Value1.01.01.0
5International Developed1.0-1.0-1.0
6Emerging Markets1.00.00.0
7Global Fixed Income-1.0-1.0-1.0
8U.S. Governments-1.01.01.0
9U.S. Mortgages-1.01.01.0
10U.S. Corporates-1.0-1.0-1.0
11International Fixed Income-1.00.00.0
12High Yield-1.0-1.0-1.0
13U.S. Investment-grade-1.00.00.0
14Tax Exempt U.S. High Yield Tax Exempt-1.0-1.0-1.0
\n", + "
" + ], + "text/plain": [ + " variable markitdown docling true_value\n", + "0 Global Equities 1.0 1.0 1.0\n", + "1 U.S. Large Cap Growth 1.0 1.0 0.0\n", + "2 U.S. Large Cap Value 1.0 1.0 1.0\n", + "3 U.S. Small Cap Growth 1.0 1.0 1.0\n", + "4 U.S. Small Cap Value 1.0 1.0 1.0\n", + "5 International Developed 1.0 -1.0 -1.0\n", + "6 Emerging Markets 1.0 0.0 0.0\n", + "7 Global Fixed Income -1.0 -1.0 -1.0\n", + "8 U.S. Governments -1.0 1.0 1.0\n", + "9 U.S. Mortgages -1.0 1.0 1.0\n", + "10 U.S. Corporates -1.0 -1.0 -1.0\n", + "11 International Fixed Income -1.0 0.0 0.0\n", + "12 High Yield -1.0 -1.0 -1.0\n", + "13 U.S. Investment-grade -1.0 0.0 0.0\n", + "14 Tax Exempt U.S. High Yield Tax Exempt -1.0 -1.0 -1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create DataFrame with specified columns\n", + "df_comparison = pd.DataFrame({\n", + " 'variable': df_docling['Variable'].iloc[:-1],\n", + " 'markitdown': df_md['Forecast'],\n", + " 'docling': df_docling['Forecast'].iloc[:-1], # Drop last row\n", + " 'true_value': [1.0, 0.0, 1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 0.0, -1.0]\n", + "})\n", + "\n", + "display(df_comparison)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Markitdown accuracy: 53.33%\n", + "Docling accuracy: 93.33%\n" + ] + } + ], + "source": [ + "# Calculate accuracy for markitdown and docling\n", + "markitdown_accuracy = (df_comparison['markitdown'] == df_comparison['true_value']).mean()\n", + "docling_accuracy = (df_comparison['docling'] == df_comparison['true_value']).mean()\n", + "\n", + "print(f\"Markitdown accuracy: {markitdown_accuracy:.2%}\")\n", + "print(f\"Docling accuracy: {docling_accuracy:.2%}\") \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n", + "\n", + "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n", + "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "from docling.document_converter import DocumentConverter" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "def convert_and_export_tables(file_path: Path) -> list[pd.DataFrame]:\n", + " \"\"\"\n", + " Convert document and export tables to DataFrames.\n", + " \n", + " Args:\n", + " file_path: Path to input document\n", + " \n", + " Returns:\n", + " List of pandas DataFrames containing the tables\n", + " \"\"\"\n", + " doc_converter = DocumentConverter()\n", + " start_time = time.time()\n", + " \n", + " conv_res = doc_converter.convert(file_path)\n", + " \n", + " tables = []\n", + " # Export tables\n", + " for table in conv_res.document.tables:\n", + " table_df: pd.DataFrame = table.export_to_dataframe()\n", + " tables.append(table_df)\n", + "\n", + " end_time = time.time() - start_time\n", + " print(f\"Document converted in {end_time:.2f} seconds.\")\n", + " \n", + " return tables\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert and export tables\n", + "tables = convert_and_export_tables(Path(FORECAST_FILE_PATH))" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(tables)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total Return in USD (%).CurrentTotal Return in USD (%).WTDTotal Return in USD (%).MTDTotal Return in USD (%).YTD
0DJIA43,828.06-1.8-2.318.4
1NASDAQ19,926.720.43.733.7
2S&P 5006,051.09-0.60.428.6
3S&P 400 Mid Cap3,277.20-1.6-2.619.5
4Russell 20002,346.90-2.5-3.517.3
5MSCI World3,817.24-1.00.222.1
6MSCI EAFE2,319.05-1.50.26.4
7MSCI Emerging Markets1,107.010.32.710.6
\n", + "
" + ], + "text/plain": [ + " Total Return in USD (%).Current \\\n", + "0 DJIA 43,828.06 \n", + "1 NASDAQ 19,926.72 \n", + "2 S&P 500 6,051.09 \n", + "3 S&P 400 Mid Cap 3,277.20 \n", + "4 Russell 2000 2,346.90 \n", + "5 MSCI World 3,817.24 \n", + "6 MSCI EAFE 2,319.05 \n", + "7 MSCI Emerging Markets 1,107.01 \n", + "\n", + " Total Return in USD (%).WTD Total Return in USD (%).MTD \\\n", + "0 -1.8 -2.3 \n", + "1 0.4 3.7 \n", + "2 -0.6 0.4 \n", + "3 -1.6 -2.6 \n", + "4 -2.5 -3.5 \n", + "5 -1.0 0.2 \n", + "6 -1.5 0.2 \n", + "7 0.3 2.7 \n", + "\n", + " Total Return in USD (%).YTD \n", + "0 18.4 \n", + "1 33.7 \n", + "2 28.6 \n", + "3 19.5 \n", + "4 17.3 \n", + "5 22.1 \n", + "6 6.4 \n", + "7 10.6 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(tables[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Total Return in USD (%).CurrentTotal Return in USD (%).WTDTotal Return in USD (%).MTDTotal Return in USD (%).YTD
0Corporate & Government4.66-1.34-0.921.94
1Agencies4.54-0.58-0.313.35
2Municipals3.55-0.87-0.541.99
3U.S. Investment Grade Credit4.79-1.38-0.931.97
4International5.17-1.40-0.903.20
5High Yield7.19-0.220.208.87
690 Day Yield4.324.394.495.33
72 Year Yield4.244.104.154.25
810 Year Yield4.404.154.173.88
930 Year Yield4.604.344.364.03
\n", + "
" + ], + "text/plain": [ + " Total Return in USD (%).Current \\\n", + "0 Corporate & Government 4.66 \n", + "1 Agencies 4.54 \n", + "2 Municipals 3.55 \n", + "3 U.S. Investment Grade Credit 4.79 \n", + "4 International 5.17 \n", + "5 High Yield 7.19 \n", + "6 90 Day Yield 4.32 \n", + "7 2 Year Yield 4.24 \n", + "8 10 Year Yield 4.40 \n", + "9 30 Year Yield 4.60 \n", + "\n", + " Total Return in USD (%).WTD Total Return in USD (%).MTD \\\n", + "0 -1.34 -0.92 \n", + "1 -0.58 -0.31 \n", + "2 -0.87 -0.54 \n", + "3 -1.38 -0.93 \n", + "4 -1.40 -0.90 \n", + "5 -0.22 0.20 \n", + "6 4.39 4.49 \n", + "7 4.10 4.15 \n", + "8 4.15 4.17 \n", + "9 4.34 4.36 \n", + "\n", + " Total Return in USD (%).YTD \n", + "0 1.94 \n", + "1 3.35 \n", + "2 1.99 \n", + "3 1.97 \n", + "4 3.20 \n", + "5 8.87 \n", + "6 5.33 \n", + "7 4.25 \n", + "8 3.88 \n", + "9 4.03 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(tables[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SectorCIO View.CIO View.UnderweightCIO View.NeutralCIO View.CIO View.Overweight
0Utilitiesslight over weight green 
1Financialsslight over weight green 
2Healthcareslight over weight green 
3Consumer DiscretionarySlight over weight green 
4Information TechnologyNeutral yellow 
5Communication ServicesNeutral yellow 
6IndustrialsNeutral yellow 
7Real EstateNeutral yellow 
8Energyslight underweight orange 
9Materialsslight underweight orange 
10Consumer Staplesunderweight red
\n", + "
" + ], + "text/plain": [ + " Sector CIO View. \\\n", + "0 Utilities slight over weight green  \n", + "1 Financials slight over weight green  \n", + "2 Healthcare slight over weight green  \n", + "3 Consumer Discretionary Slight over weight green  \n", + "4 Information Technology Neutral yellow  \n", + "5 Communication Services Neutral yellow  \n", + "6 Industrials Neutral yellow  \n", + "7 Real Estate Neutral yellow  \n", + "8 Energy slight underweight orange  \n", + "9 Materials slight underweight orange  \n", + "10 Consumer Staples underweight red \n", + "\n", + " CIO View.Underweight CIO View.Neutral CIO View. CIO View.Overweight \n", + "0    \n", + "1    \n", + "2    \n", + "3    \n", + "4    \n", + "5    \n", + "6    \n", + "7    \n", + "8    \n", + "9    \n", + "10     " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(tables[6])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model to its constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "md_llm = MarkItDown(llm_client=client, llm_model=\"gpt-4o-mini\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = md_llm.convert(\"../data/input/forecast.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's the description we obtain from the image of our input document." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "\n", + "# Description:\n", + "**Markets in Review: Economic Forecasts and Asset Class Weightings (as of 12/13/2024)**\n", + "\n", + "This detailed market overview presents key performance metrics and economic forecasts as of December 13, 2024.\n", + "\n", + "**Equities Overview:**\n", + "- **Total Returns:** Highlights returns for major indices such as the DJIA (18.4% YTD), NASDAQ (33.7% YTD), and S&P 500 (28.6% YTD), showcasing strong performance across the board.\n", + "- **Forecasts:** Economic indicators reveal a projected real global GDP growth of 3.1%, with inflation rates expected to stabilize around 2.2% in 2025. Unemployment rates are anticipated to remain low at 4.4%.\n", + "\n", + "**Fixed Income:**\n", + "- Focuses on various segments, including Corporate & Government bonds, which offer an annualized return of 4.66% and indicate shifting trends in interest rates over 2-Year (4.25%) and 10-Year (4.03%) bonds.\n", + "\n", + "**Commodities & Currencies:**\n", + "- Commodities such as crude oil and gold show varied performance, with oil increasing by 4.8% and gold prices sitting at $2,648.23 per ounce.\n", + "- Currency metrics highlight the Euro and USD trends over the past year.\n", + "\n", + "**S&P Sector Returns:**\n", + "- A quick reference for sector performance indicates a significant 2.5% return in Communication Services, while other sectors like Consumer Staples and Materials display minor fluctuations.\n", + "\n", + "**CIO Asset Class Weightings:**\n", + "- Emphasizes strategic asset allocation recommendations which are crucial for an investor's portfolio. Underweight positions in U.S. Small Cap Growth and International Developed contrast with overweight positions in certain sectors such as Utilities and Financials, signaling tactical shifts based on ongoing economic assessments.\n", + "\n", + "**Note:** This summary is sourced from BofA Global Research and aims to provide a comprehensive view of current market conditions and forecasts to assist investors in making informed decisions.\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(result.text_content))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Overall, the description is somewhat accurate but contains a few inaccuracies including:\n", + "\n", + "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n", + "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n", + "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n", + "\n", + "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieval-Augmented Generation\n", + "\n", + "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n", + "\n", + "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Studies\n", + "\n", + "This section presents three case studies that demonstrate practical solutions to common LLM limitations:\n", + "\n", + "First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "\n", + "Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.\n", + "\n", + "Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study I: Content Chunking with Contextual Linking\n", + "\n", + "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n", + "1. The LLM's inability to process long inputs to do context-size limits\n", + "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n", + "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "\n", + "Here, we exemplify this technique by following these steps:\n", + "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "\n", + "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "\n", + "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "\n", + "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "\n", + "Let's examine an example implementation of this technique.\n", + "\n", + "#### Generating long-form content\n", + "\n", + "- Goal: Generate a long-form report analyzing a company's financial statement.\n", + "- Input: A company's 10K SEC filing.\n", + "\n", + "```{figure} ../_static/structured_output/diagram1.png\n", + "---\n", + "name: content-chunking-with-contextual-linking\n", + "alt: Content Chunking with Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Content Chunking with Contextual Linking Schematic Representation.\n", + "```\n", + "\n", + "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "\n", + "**Step 1: Chunking the Content**\n", + "\n", + "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", + "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", + "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", + " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", + " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", + " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "\n", + " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", + " \"\"\"\n", + " Split input text into chunks of specified size with specified overlap.\n", + "\n", + " Args:\n", + " text (str): The input text to be chunked.\n", + " chunk_size (int): The maximum size of each chunk in tokens.\n", + " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "\n", + " Returns:\n", + " list: A list of text chunks.\n", + " \"\"\"\n", + " from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + " return text_splitter.split_text(text)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 2: Writing the Base Prompt Template**\n", + "\n", + "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", + "- `role`: Defines the role or persona the model should assume.\n", + "- `context`: Provides the background information or context for the task.\n", + "- `instruction`: Specifies the task or action the model needs to perform.\n", + "- `input_text`: Contains the actual text input that the model will process.\n", + "- `requirements`: Lists any specific requirements or constraints for the output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.prompts import PromptTemplate\n", + "def get_base_prompt_template() -> str:\n", + " \n", + " base_prompt = \"\"\"\n", + " ROLE: {role}\n", + " CONTEXT: {context}\n", + " INSTRUCTION: {instruction}\n", + " INPUT: {input}\n", + " REQUIREMENTS: {requirements}\n", + " \"\"\"\n", + " \n", + " prompt = PromptTemplate.from_template(base_prompt)\n", + " return prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_community.chat_models import ChatLiteLLM\n", + "\n", + "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", + " \"\"\"\n", + " Returns an LLMChain instance using langchain.\n", + "\n", + " Args:\n", + " prompt_template (str): The prompt template to use.\n", + " model_name (str): The name of the model to use.\n", + " temperature (float): The temperature setting for the model.\n", + "\n", + " Returns:\n", + " llm_chain: An instance of the LLMChain.\n", + " \"\"\"\n", + " \n", + " from dotenv import load_dotenv\n", + " import os\n", + "\n", + " # Load environment variables from .env file\n", + " load_dotenv()\n", + " \n", + " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", + " llm = ChatLiteLLM(\n", + " model=model_name,\n", + " temperature=temperature,\n", + " api_key=os.environ[api_key_label],\n", + " )\n", + " llm_chain = prompt_template | llm | StrOutputParser()\n", + " return llm_chain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 3: Constructing Dynamic Prompt Parameters**\n", + "\n", + "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "def get_dynamic_prompt_params(prompt_params: Dict, \n", + " part_idx: int, \n", + " total_parts: int,\n", + " chat_context: str,\n", + " chunk: str) -> str:\n", + " \"\"\"\n", + " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", + " \n", + " Args:\n", + " prompt_params (Dict): Original prompt parameters\n", + " part_idx (int): Index of current conversation part\n", + " total_parts (int): Total number of conversation parts\n", + " chat_context (str): Chat context from previous parts\n", + " chunk (str): Current chunk of text to be processed\n", + " Returns:\n", + " str: Dynamically constructed prompt template with part-specific params\n", + " \"\"\"\n", + " dynamic_prompt_params = prompt_params.copy()\n", + " # saves the chat context from previous parts\n", + " dynamic_prompt_params[\"context\"] = chat_context\n", + " # saves the current chunk of text to be processed as input\n", + " dynamic_prompt_params[\"input\"] = chunk\n", + " \n", + " # Add part-specific instructions\n", + " if part_idx == 0: # Introduction part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the Introduction part of a long report.\n", + " Don't cover any topics yet, just define the scope of the report.\n", + " \"\"\"\n", + " elif part_idx == total_parts - 1: # Conclusion part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the last part of a long report. \n", + " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", + " \"\"\"\n", + " else: # Main analysis part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", + " For this part, analyze the below INPUT.\n", + " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", + " \"\"\"\n", + " \n", + " return dynamic_prompt_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**Step 4: Generating the Report**\n", + "\n", + "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_report(input_content: str, llm_model_name: str, \n", + " role: str, requirements: str,\n", + " chunk_size: int, chunk_overlap: int) -> str:\n", + " # stores the parts of the report, each generated by an individual LLM call\n", + " report_parts = [] \n", + " # split the input content into chunks\n", + " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", + " # initialize the chat context with the input content\n", + " chat_context = input_content\n", + " # number of parts to be generated\n", + " num_parts = len(chunks)\n", + "\n", + " prompt_params = {\n", + " \"role\": role, # user-provided\n", + " \"context\": \"\", # dinamically updated per part\n", + " \"instruction\": \"\", # dynamically updated per part\n", + " \"input\": \"\", # dynamically updated per part\n", + " \"requirements\": requirements #user-priovided\n", + " }\n", + "\n", + " # get the LLMChain with the base prompt template\n", + " llm_chain = get_llm_chain(get_base_prompt_template(), \n", + " llm_model_name)\n", + "\n", + " # dynamically update prompt_params per part\n", + " print(f\"Generating {num_parts} report parts\")\n", + " for i, chunk in enumerate(chunks):\n", + " dynamic_prompt_params = get_dynamic_prompt_params(\n", + " prompt_params,\n", + " part_idx=i,\n", + " total_parts=num_parts,\n", + " chat_context=chat_context,\n", + " chunk=chunk\n", + " )\n", + " \n", + " # invoke the LLMChain with the dynamically updated prompt parameters\n", + " response = llm_chain.invoke(dynamic_prompt_params)\n", + "\n", + " # update the chat context with the cummulative response\n", + " if i == 0:\n", + " chat_context = response\n", + " else:\n", + " chat_context = chat_context + response\n", + " \n", + " print(f\"Generated part {i+1}/{num_parts}.\")\n", + " report_parts.append(response)\n", + "\n", + " report = \"\\n\".join(report_parts)\n", + " return report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example Usage**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the text from sample 10K SEC filing\n", + "with open('../data/apple.txt', 'r') as file:\n", + " text = file.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the chunk and chunk overlap size\n", + "MAX_CHUNK_SIZE = 10000\n", + "MAX_CHUNK_OVERLAP = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report = generate_report(text, llm_model_name=\"gemini/gemini-1.5-flash-latest\", \n", + " role=\"Financial Analyst\", \n", + " requirements=\"The report should be in a readable, structured format, easy to understand and follow. Focus on finding risk factors and market moving insights.\",\n", + " chunk_size=MAX_CHUNK_SIZE, \n", + " chunk_overlap=MAX_CHUNK_OVERLAP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save the generated report to a local file\n", + "with open('data/apple_report.txt', 'w') as file:\n", + " file.write(report)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**Introduction**\n", + "\n", + "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for the fiscal year ended September 28, 2024, as disclosed in its Form 10-K filing with the United States Securities and Exchange Commission. The analysis will focus on identifying key risk factors impacting Apple's business, evaluating its financial health, and uncovering market-moving insights derived from the provided data. The report will delve into Apple's various segments, product lines, and services, examining their performance and contributions to overall financial results. Specific attention will be paid to identifying trends, potential challenges, and opportunities for future growth. The analysis will also consider the broader macroeconomic environment and its influence on Apple's operations and financial outlook. Finally, the report will incorporate relevant information from Apple's definitive proxy statement for its 2025 annual meeting of shareholders, as incorporated by reference in the Form 10-K.\n", + "\n", + "**PART 2: Key Risk Factors and Market-Moving Insights**\n", + "\n", + "This section analyzes key risk factors disclosed in Apple Inc.'s 2024 Form 10-K, focusing on their potential impact on financial performance and identifying potential market-moving insights. The analysis is structured around the major risk categories identified in the filing.\n", + "\n", + "**2.1 Dependence on Third-Party Developers:**\n", + "\n", + "Apple's success is heavily reliant on the continued support and innovation of third-party software developers. The Form 10-K highlights several critical aspects of this dependence:\n", + "\n", + "* **Market Share Vulnerability:** Apple's relatively smaller market share in smartphones, personal computers, and tablets compared to competitors (Android, Windows, gaming consoles) could discourage developers from prioritizing Apple's platform, leading to fewer high-quality apps and potentially impacting customer purchasing decisions. This is a significant risk, especially given the rapid pace of technological change. A decline in app availability or quality could negatively impact sales and market share. **Market-moving insight:** Monitoring developer activity and app quality across competing platforms is crucial for assessing this risk. Any significant shift in developer focus away from iOS could be a negative market signal.\n", + "\n", + "* **App Store Dynamics:** While Apple allows developers to retain most App Store revenue, its commission structure and recent changes (e.g., complying with the Digital Markets Act (DMA) in the EU) introduce uncertainty. Changes to the App Store's policies or fee structures could materially affect Apple's revenue and profitability. **Market-moving insight:** Closely monitoring regulatory developments (especially concerning the DMA) and their impact on App Store revenue is essential. Any significant changes to Apple's App Store policies or revenue streams could trigger market reactions.\n", + "\n", + "* **Content Acquisition and Creation:** Apple's reliance on third-party digital content providers for its services introduces risks related to licensing agreements, competition, and pricing. The cost of producing its own digital content is also increasing due to competition for talent and subscribers. Failure to secure or create appealing content could negatively impact user engagement and revenue. **Market-moving insight:** Analyzing the success of Apple's original content initiatives and the renewal rates of third-party content agreements will provide insights into this risk.\n", + "\n", + "**2.2 Operational Risks:**\n", + "\n", + "\n", + " (...) \n", + "\n", + " The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability. While increased R&D is generally positive, it reduces short-term profits. The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China. **Market-moving insight:** Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n", + "\n", + "\n", + "**5.4 Auditor's Report and Internal Controls:**\n", + "\n", + "The auditor's report expresses an unqualified opinion on Apple's financial statements and internal control over financial reporting. However, it identifies uncertain tax positions as a critical audit matter. The significant amount of unrecognized tax benefits ($22.0 billion) and the complexity involved in evaluating these positions highlight a substantial risk. Management's assessment of these positions involves significant judgment and relies on interpretations of complex tax laws. Apple's management also asserts that its disclosure controls and procedures are effective. **Market-moving insight:** Any changes in tax laws, unfavorable rulings on uncertain tax positions, or weaknesses in internal controls could materially affect Apple's financial results and investor confidence.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for fiscal year 2024. While Apple maintains a strong financial position with substantial cash reserves and a robust capital return program, several key risk factors could significantly impact its future performance. These risks include:\n", + "\n", + "* **Dependence on third-party developers:** A shift in developer focus away from iOS or changes to the App Store's policies could negatively impact Apple's revenue and profitability.\n", + "* **Operational risks:** Employee retention challenges, reseller dependence, and cybersecurity threats pose significant operational risks.\n", + "* **Legal and regulatory risks:** Ongoing antitrust litigation, the Digital Markets Act (DMA) compliance, and data privacy regulations introduce substantial legal and regulatory uncertainties.\n", + "* **Financial risks:** Volatility in sales and profit margins, foreign exchange rate fluctuations, credit risk, and tax risks could impact Apple's financial performance.\n", + "* **Supply chain concentration:** Apple's reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, and dependence on single or limited sources for certain custom components, exposes the company to significant supply chain risks.\n", + "* **Uncertain tax positions:** The significant amount of unrecognized tax benefits represents a substantial uncertainty that could materially affect Apple's financial results.\n", + "\n", + "Despite these risks, Apple's strong liquidity position, continued growth in its Services segment, and robust capital return program provide a degree of resilience. However, investors and analysts should closely monitor the market-moving insights identified throughout this report, including developer activity, regulatory developments, regional economic conditions, supply chain stability, and the resolution of uncertain tax positions, to assess their potential impact on Apple's future performance and valuation. The significant short-term obligations, while manageable given Apple's cash position, highlight the need for continued financial discipline and effective risk management. A deeper, more granular analysis of the financial statements and notes is recommended for a more complete assessment." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Read and display the generated report\n", + "with open('../data/apple_report.txt', 'r') as file:\n", + " report_content = file.read()\n", + " \n", + "from IPython.display import Markdown\n", + "\n", + "# Display first and last 10% of the report content\n", + "report_lines = report_content.splitlines()\n", + "total_lines = len(report_lines)\n", + "quarter_lines = total_lines // 10\n", + "\n", + "top_portion = '\\n'.join(report_lines[:quarter_lines])\n", + "bottom_portion = '\\n'.join(report_lines[-quarter_lines:])\n", + "\n", + "display(Markdown(f\"{top_portion}\\n\\n (...) \\n\\n {bottom_portion}\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Discussion\n", + "\n", + "Results from the generated report present a few interesting aspects:\n", + "\n", + "- **Coherence**: The generated report demonstrates an apparent level of coherence. The sections are logically structured, and the flow of information is smooth. Each part of the report builds upon the previous sections, providing a comprehensive analysis of Apple Inc.'s financial performance and key risk factors. The use of headings and subheadings helps in maintaining clarity and organization throughout the document.\n", + "\n", + "- **Adherence to Instructions**: The LLM followed the provided instructions effectively. The report is in a readable, structured format, and it focuses on identifying risk factors and market-moving insights as requested. The analysis is detailed and covers various aspects of Apple's financial performance, including revenue segmentation, profitability, liquidity, and capital resources. The inclusion of market-moving insights adds value to the report, aligning with the specified requirements.\n", + "\n", + "Despite the seemingly good quality of the results, there are some limitations to consider:\n", + "\n", + "- **Depth of Analysis**: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert's evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.\n", + "\n", + "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n", + "\n", + "Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic's Contextual Retrieval {cite}`anthropic2024contextualretrieval`. The approach, as shown in {numref}`anth_contextual`, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.\n", + "```{figure} ../_static/input/anth_contextual.png\n", + "---\n", + "name: anth_contextual\n", + "alt: Anthropic Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Anthropic Contextual Linking {cite}`anthropic2024contextualretrieval`.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study II: Github RAG\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study III: Quiz Generation with Citations\n", + "\n", + "In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.\n", + "\n", + "#### Use Case\n", + "\n", + "Let's assume you are a Harvard student enrolled in GOV 1039 \"The Birth of Modern Democracy\" (see {numref}`harvard-class`), you face a daunting reading list for next Tuesday's class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.\n", + "\n", + "```{figure} ../_static/input/harvard.png\n", + "---\n", + "name: harvard-class\n", + "alt: Harvard Class\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Harvard's Democratic Theory Class\n", + "```\n", + "\n", + "Instead of trudging through these dense historical texts sequentially, we would like to:\n", + "- Extract key insights and connections between these documents, conversationally.\n", + "- Engage with the material through a quiz format.\n", + "- Add citations to help with verifying answers.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Implementation\n", + "\n", + "The full implementation is available at Book's [Github repository](https://github.com/souzatharsis/tamingLLMs/tamingllms/notebooks/src/gemini_duo.py). Here, we will cover the most relevant parts of the implementation.\n", + "\n", + "**Client Class**\n", + "\n", + "First, we will define the `Client` class which will provide the key interface users will interact with. It has the following summarized interface:\n", + "\n", + "- Initialization:\n", + " - `__init__(knowledge_base: List[str] = [])`: Initialize with optional list of URLs as knowledge base\n", + "\n", + "- Core Methods:\n", + " - `add_knowledge_base(urls: List[str]) -> None`: Add URLs to the knowledge base\n", + " - `add(urls: List[str]) -> None`: Extract content from URLs and add to conversation input\n", + " - `msg(msg: str = \"\", add_citations: bool = False) -> str`: Enables users to send messages to the client\n", + " - `quiz(add_citations: bool = True, num_questions: int = 10) -> str`: Generate a quiz based on full input memory\n", + "\n", + "- Key Attributes:\n", + " - `knowledge_base`: List of URLs providing foundation knowledge\n", + " - `input`: Current input being studied (short-term memory)\n", + " - `input_memory`: Cumulative input + knowledge base (long-term memory) \n", + " - `response`: Latest response from LLM\n", + " - `response_memory`: Cumulative responses (long-term memory)\n", + " - `urls_memory`: Cumulative list of processed URLs\n", + "\n", + "\n", + "**Corpus-in-Context Prompting**\n", + "\n", + "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n", + "\n", + "{numref}`cic` shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.\n", + "\n", + "```{figure} ../_static/input/cic.png\n", + "---\n", + "name: cic\n", + "alt: CIC Format\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Example of Corpus-in-Context Prompting for retrieval. \n", + "```\n", + "\n", + "CiC prompting leverages LLM's capacity to follow instructions by carefully annotating the corpus with document IDs. It benefits from a strong, capable models to retrieve over large corpora provided in context. \n", + "\n", + "```python\n", + " def add(self, urls: List[str]) -> None:\n", + " self.urls = urls\n", + "\n", + " # Add new content to input following CIC format to enable citations\n", + " for url in urls:\n", + " self.urls_memory.append(url)\n", + " content = self.extractor.convert(url).text_content\n", + " formatted_content = f\"ID: {self.reference_id} | {content} | END ID: {self.reference_id}\"\n", + " self.input += formatted_content + \"\\n\" \n", + " self.reference_id += 1\n", + " \n", + " # Update memory\n", + " self.input_memory = self.input_memory + self.input\n", + "```\n", + "\n", + "The method `add_knowledge_base()` is a simple wrapper around the `add()` method. It is used to add URLs to the knowledge base, which are later cached by the LLM model as we will see later.\n", + "\n", + "```python\n", + " def add_knowledge_base(self, urls: List[str]) -> None:\n", + " self.add(urls)\n", + "```\n", + "\n", + "\n", + "Later, when the user sends a message to the client, the `msg()` method is used to generate a response while enabling citations. `self.content_generator` is an instance of our LLM model, which we will go through next.\n", + "\n", + "```python\n", + " def msg(self, msg: str = \"\", add_citations: bool = False) -> str:\n", + " if add_citations:\n", + " msg = msg + \"\\n\\n For key statements, add Input ID to the response.\"\n", + "\n", + " self.response = self.content_generator.generate(\n", + " input_content=self.input,\n", + " user_instructions=msg\n", + " )\n", + "\n", + " self.response_memory = self.response_memory + self.response.text\n", + "\n", + " return self.response.text\n", + "```\n", + "\n", + "**Prompt Caching**\n", + "\n", + "LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.\n", + "\n", + "In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` containing the entire knowledge base and any additional user added content.\n", + "```python\n", + "self.llm = LLMBackend(input=self.input_memory)\n", + "```\n", + "\n", + "In our `LLMBackend` Class, we leverage prompt caching on input tokens and uses them for subsequent requests.\n", + "\n", + "```python\n", + "class LLMBackend:\n", + " def __init__(self, model_name: str, input: str, cache_ttl: int = 60):\n", + " self.cache = caching.CachedContent.create(\n", + " model=model_name,\n", + " display_name='due_knowledge_base', # used to identify the cache\n", + " system_instruction=(\n", + " self.compose_prompt(input, conversation_config)\n", + " ),\n", + " ttl=datetime.timedelta(minutes=cache_ttl),\n", + " )\n", + "\n", + " self.model = genai.GenerativeModel.from_cached_content(cached_content=self.cache)\n", + "```\n", + "\n", + "**Quiz Generation**\n", + "\n", + "Coming back to our `Client` class, we implement the `quiz()` method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.\n", + "\n", + "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke its `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n", + "\n", + "```python\n", + " def quiz(self, add_citations: bool = True, num_questions: int = 10) -> str:\n", + " \"\"\"\n", + " Returns a quiz instance based on full input memory.\n", + " \"\"\"\n", + " self.quiz_instance = Quiz(\n", + " input=self.input_memory,\n", + " add_citations=add_citations,\n", + " num_questions=num_questions)\n", + " return self.quiz_instance\n", + "```\n", + "\n", + "We write a simple prompt template for quiz generation:\n", + "\n", + "> ROLE:\n", + "> - You are a Harvard Professor providing a quiz.\n", + "> INSTRUCTIONS:\n", + "> - Generate a quiz with {num_questions} questions based on the input.\n", + "> - The quiz should be multi-choice.\n", + "> - Answers should be provided at the end of the quiz.\n", + "> - Questions should have broad coverage of the input including multiple Input IDs.\n", + "> - Level of difficulty is advanced/hard.\n", + "> - {{citations}}\n", + ">\n", + "> STRUCTURE:\n", + "> - Sequence of questions and alternatives.\n", + "> - At the end provide the correct answers.\n", + "\n", + "where, `{citations}` instructs the model to add CiC citations to the response if user requests it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example Usage\n", + "\n", + "\n", + "**Dataset**\n", + "\n", + "First, we will define our knowledge base. \n", + "\n", + "- Harvard Class: [GOV 1039 Syllabus](https://scholar.harvard.edu/files/dlcammack/files/gov_1039_syllabus.pdf)\n", + "- Class / Topic: \"Rights\"\n", + "- Reading List:\n", + " - ID 1. The Declaration of Independence of the United States of America\n", + " - ID 2. The United States Bill of Rights\n", + " - ID 3. John F. Kennedy's Inaugural Address\n", + " - ID 4. Lincoln's Gettysburg Address\n", + " - ID 5. The United States Constitution\n", + " - ID 6. Give Me Liberty or Give Me Death\n", + " - ID 7. The Mayflower Compact\n", + " - ID 8. Abraham Lincoln's Second Inaugural Address\n", + " - ID 9. Abraham Lincoln's First Inaugural Address\n", + "\n", + "We will take advantage of Project Gutenberg's to create our knowledge base." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kb = [f\"https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt\" for i in range(1,9)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will import our module `gemini_duo` as `genai_duo` and initialize the `Client` class with our knowledge base." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gemini_duo as genai_duo\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "duo = genai_duo.Client(knowledge_base=kb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, we converted each book into markdown using MarkitDown and cached the content in our LLM model. We can access how many tokens we have cached in our LLM model by looking at the `usage_metadata` attribute of the Gemini's model response. At this point, we have cached at total of 38470 tokens.\n", + "\n", + "Now, we can add references to our knowledge base at anytime by calling the `add()` method. We add the following references:\n", + "1. The Magna Carta\n", + "2. William Shap McKechnie on Magna Carta book" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "study_references = [\"https://www.gutenberg.org/cache/epub/10000/pg10000.txt\", \"https://www.gutenberg.org/cache/epub/65363/pg65363.txt\"]\n", + "\n", + "duo.add(study_references)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can instantiate a `Quiz` object and generate a quiz based on the full input memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "quiz = duo.quiz(add_citations=True)\n", + "display(Markdown(quiz.generate()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "{numref}`quiz` shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n", + "\n", + "```{figure} ../_static/input/quiz.png\n", + "---\n", + "name: quiz\n", + "alt: Quiz with Citations\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Sample Quiz with Citations.\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Discussion\n", + "\n", + "The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n", + "\n", + "However, several limitations emerged during this process:\n", + "\n", + "1. Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.\n", + "\n", + "2. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n", + "\n", + "3. Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.\n", + "\n", + "While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", + "\n", + "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", + "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n", + "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n", + "\n", + "```\n", + "@misc{tharsistpsouza2024tamingllms,\n", + " author = {Tharsis T. P. Souza},\n", + " title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n", + " year = {2024},\n", + " chapter = {Managing Input Data},\n", + " journal = {GitHub repository},\n", + " url = {https://github.com/souzatharsis/tamingLLMs)\n", + "}\n", + "```\n", + "## References\n", + "```{bibliography}\n", + ":filter: docname in docnames\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tamingllms/_build/html/_sources/notebooks/local.ipynb b/tamingllms/_build/html/_sources/notebooks/local.ipynb index fa1f01e..7a717ce 100644 --- a/tamingllms/_build/html/_sources/notebooks/local.ipynb +++ b/tamingllms/_build/html/_sources/notebooks/local.ipynb @@ -181,11 +181,11 @@ "Performance Comparison including proprietary models.\n", "```\n", "\n", - "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n", + "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive cost efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n", "\n", - "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.\n", + "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.\n", "\n", - "```{figure} ../_static/local/deep.png\n", + "```{figure} ../_static/local/deep.jpeg\n", "---\n", "name: deep\n", "alt: DeepSeek-V3\n", @@ -195,7 +195,7 @@ "DeepSeek-V3 Performance Comparison\n", "```\n", "\n", - "```{figure} ../_static/local/deep2.png\n", + "```{figure} ../_static/local/deep2.jpeg\n", "---\n", "name: deep2\n", "alt: DeepSeek-V3 Cost Benefit Analysis\n", diff --git a/tamingllms/_build/html/_static/input/anth_contextual.png b/tamingllms/_build/html/_static/input/anth_contextual.png new file mode 100644 index 0000000..c8401c0 Binary files /dev/null and b/tamingllms/_build/html/_static/input/anth_contextual.png differ diff --git a/tamingllms/_build/html/_static/input/asset_class.png b/tamingllms/_build/html/_static/input/asset_class.png new file mode 100644 index 0000000..237d081 Binary files /dev/null and b/tamingllms/_build/html/_static/input/asset_class.png differ diff --git a/tamingllms/_build/html/_static/input/docling.png b/tamingllms/_build/html/_static/input/docling.png new file mode 100644 index 0000000..143ded9 Binary files /dev/null and b/tamingllms/_build/html/_static/input/docling.png differ diff --git a/tamingllms/_build/html/_static/input/markitdown.png b/tamingllms/_build/html/_static/input/markitdown.png new file mode 100644 index 0000000..282503c Binary files /dev/null and b/tamingllms/_build/html/_static/input/markitdown.png differ diff --git a/tamingllms/_build/html/genindex.html b/tamingllms/_build/html/genindex.html index ca967a8..6b43854 100644 --- a/tamingllms/_build/html/genindex.html +++ b/tamingllms/_build/html/genindex.html @@ -160,6 +160,15 @@ + + + +
  • + + Managing Input Data + + +
  • diff --git a/tamingllms/_build/html/markdown/intro.html b/tamingllms/_build/html/markdown/intro.html index 2fe936a..0ac47eb 100644 --- a/tamingllms/_build/html/markdown/intro.html +++ b/tamingllms/_build/html/markdown/intro.html @@ -178,6 +178,15 @@ + + + +
  • + + Managing Input Data + + +
  • diff --git a/tamingllms/_build/html/markdown/preface.html b/tamingllms/_build/html/markdown/preface.html index 7f5b8d9..18b7f65 100644 --- a/tamingllms/_build/html/markdown/preface.html +++ b/tamingllms/_build/html/markdown/preface.html @@ -160,6 +160,15 @@ + + + +
  • + + Managing Input Data + + +
  • @@ -236,7 +245,7 @@

    1. Preface—Emanuel Derman

    -

    An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” [Derman, 2011]. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.

    +

    An alternative title of this book could have been “Language Models Behaving Badly”. If you are coming from a background in financial modeling, you may have noticed the parallel with Emanuel Derman’s seminal work “Models.Behaving.Badly” [Derman, 2011]. This parallel is not coincidental. Just as Derman cautioned against treating financial models as perfect representations of reality, this book aims to highlight the limitations and pitfalls of Large Language Models (LLMs) in practical applications.

    The book “Models.Behaving.Badly” by Emanuel Derman, a former physicist and Goldman Sachs quant, explores how financial and scientific models can fail when we mistake them for reality rather than treating them as approximations full of assumptions. The core premise of his work is that while models can be useful tools for understanding aspects of the world, they inherently involve simplification and assumptions. Derman argues that many financial crises, including the 2008 crash, occurred partly because people put too much faith in mathematical models without recognizing their limitations.

    Like financial models that failed to capture the complexity of human behavior and market dynamics, LLMs have inherent constraints. They can hallucinate facts, struggle with logical reasoning, and fail to maintain consistency across long outputs. Their responses, while often convincing, are probabilistic approximations based on training data rather than true understanding even though humans insist on treating them as “machines that can reason”.

    @@ -244,7 +253,7 @@

    1. Preface -
    +
    [Der11]

    E. Derman. Models.Behaving.Badly.: Why Confusing Illusion with Reality Can Lead to Disaster, on Wall Street and in Life. Free Press, 2011. ISBN 9781439165010. URL: https://books.google.co.uk/books?id=lke_cwM4wm8C.

    diff --git a/tamingllms/_build/html/markdown/toc.html b/tamingllms/_build/html/markdown/toc.html index 718db23..2ceca2e 100644 --- a/tamingllms/_build/html/markdown/toc.html +++ b/tamingllms/_build/html/markdown/toc.html @@ -153,6 +153,15 @@ + + + +
  • + + Managing Input Data + + +
  • diff --git a/tamingllms/_build/html/notebooks/alignment.html b/tamingllms/_build/html/notebooks/alignment.html index b518c25..7a59c09 100644 --- a/tamingllms/_build/html/notebooks/alignment.html +++ b/tamingllms/_build/html/notebooks/alignment.html @@ -4,7 +4,7 @@ - 6. Preference-Based Alignment + 7. Preference-Based Alignment @@ -47,8 +47,8 @@ - - + + @@ -164,6 +164,15 @@ + + + +
  • + + Managing Input Data + + +
  • @@ -233,18 +242,18 @@ @@ -253,7 +262,7 @@
    -

    6. Preference-Based Alignment

    +

    7. Preference-Based Alignment

    A people that values its privileges above its principles soon loses both.

    —Dwight D. Eisenhower

    @@ -261,71 +270,71 @@
    -

    6.1. Introduction

    +

    7.1. Introduction

    The release of ChatGPT 3.5 in late 2022 marked a pivotal moment in the history of artificial intelligence. Within just five days of its launch, the model attracted over a million users, and within two months, it became the fastest-growing consumer application in history with over 100 million monthly active users.

    Yet, this raises an intriguing question: Why did ChatGPT 3.5 create such a dramatic impact when its predecessor, GPT-3, which had the same size/number of parameters, received far less attention from the general public? Arguably, the answer lies not in raw capabilities, but in Preference Alignment. Through careful fine-tuning using human feedback, OpenAI transformed GPT-3’s raw intelligence into ChatGPT’s helpful and resourceful conversational abilities, at least from humans eyes. This breakthrough demonstrated that aligning language models with human preferences is just as crucial as scaling them to greater sizes.

    In this chapter, we will explore the process of aligning language models with human preferences via fine-tuning using modern techniques such as Direct Preference Optimization (DPO) [Rafailov et al., 2024]. Next, we will present a practical case study where we align a language model to a user-provided policy in a fully automated fashion leading to an open source model as well as a dataset of policy-aligned preferences.

    -

    6.2. From Raw Capabilities to Preference Alignment

    +

    7.2. From Raw Capabilities to Preference Alignment

    -

    6.2.1. On the Misalignment of Language Models

    +

    7.2.1. On the Misalignment of Language Models

    Common pre-trained LLMs are not helpful to humans by default. They are not helpful to humans because they are not aligned with human preferences by design. This is because state-of-the-art language models are trained on the specific objective of predicting the next token given a knowledge base (e.g. large number of webpages from the internet). This is a very different objective than being asked to follow user’s instructions while being safe and helpful. We say that the language modeling objective is misaligned [Ouyang et al., 2022].

    Let’s take a look at GPT-2’s response to the following prompt: “Explain the moon landing to a 6 year old.”

    @@ -374,15 +383,15 @@

    6.2.2. Aligning Language Models with Human Preferences

    +

    7.2.2. Aligning Language Models with Human Preferences

    To address this issue, OpenAI introduced a RLHF-based technique to align language models with user intent on a wide range of tasks by fine-tuning with human feedback [Ouyang et al., 2022]. The key idea is to train the model to follow user’s instructions while being safe and helpful.

    OpenAI RLHF Pipeline
    -

    Fig. 6.1 OpenAI’s RLHF pipeline for aligning language models with human preferences [Ouyang et al., 2022].

    +

    Fig. 7.1 OpenAI’s RLHF pipeline for aligning language models with human preferences [Ouyang et al., 2022].

    -

    Fig. 6.1 illustrates OpenAI’s 3-step process for training language models to better follow human instructions using RLHF:

    +

    Fig. 7.1 illustrates OpenAI’s 3-step process for training language models to better follow human instructions using RLHF:

    1. Collect demonstration data and train a supervised policy

    @@ -414,24 +423,24 @@

    Fig. 6.2 illustrates a simplified view of this alignment process showing the progression from base model to instruction-tuned model to aligned model.

    +

    Fig. 7.2 illustrates a simplified view of this alignment process showing the progression from base model to instruction-tuned model to aligned model.

    Alignment Simplified
    -

    Fig. 6.2 Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model [Ouyang et al., 2022].

    +

    Fig. 7.2 Simplified view of the alignment process showing the progression from base model to instruction-tuned model to aligned model [Ouyang et al., 2022].

    -

    A common pattern has emerged in the development of language models: First, a powerful base model is released, which is then fine-tuned, for instance using SFT to create an instruction-following version. This instruct model can then be further aligned with human preferences using techniques such as RLHF to create an aligned version as illustrated in Fig. 6.3.

    +

    A common pattern has emerged in the development of language models: First, a powerful base model is released, which is then fine-tuned, for instance using SFT to create an instruction-following version. This instruct model can then be further aligned with human preferences using techniques such as RLHF to create an aligned version as illustrated in Fig. 7.3.

    Instruction fine-tuning process
    -

    Fig. 6.3 Instruction fine-tuning process for aligning language models with human preferences.

    +

    Fig. 7.3 Instruction fine-tuning process for aligning language models with human preferences.

    An aligned model can be fine-tuned directly from a base model or from an instruction-tuned model. For example, Llama Guard 3 [Llama Team, 2024] is a Llama-3.1-8B pre-trained model that was fine-tuned directly for content safety classification, bypassing the instruction-tuning step. Similarly, Zephyr-7B-alpha [Face, 2024] demonstrates direct alignment from a base model - it is a fine-tuned version of Mistral-7B that was trained using Direct Preference Optimization (DPO) on publicly available datasets to create a helpful assistant.

    The OpenAI paper introduced two key components of this fine-tuning process - SFT for instruction tuning and RLHF (PPO in particular) for alignment. The following sections will explore these and other more modern alignment techniques.

    -

    6.2.2.1. Supervised Fine-Tuning (SFT) for Model Alignment

    +

    7.2.2.1. Supervised Fine-Tuning (SFT) for Model Alignment

    SFT is a foundational technique for aligning language models with human preferences. Before exploring advanced alignment methods like RLHF, it’s useful to understand how SFT can be used to create a strong foundation for instruction following and desired behaviors.

    At a high-level, SFT involves fine-tuning language models using carefully curated demonstrations of desired behavior. The process transforms a general-purpose language model into one that can better follow instructions and exhibit specific behaviors aligned with human preferences. Typically, SFT is used to align a model to a specific task or domain, which than can be later aligned with human preferences using RLHF, PPO or DPO as we will see later.

    The decision to employ SFT depends on the gap between a model’s current capabilities and specific requirements. SFT proves particularly valuable in scenarios requiring:

    @@ -449,14 +458,14 @@

    [Hu et al., 2021]

    +
  • LoRA (Low-Rank Adaptation) [Hu et al., 2021]

    • Uses two small matrices instead of updating all weights

    • Maintains model performance while reducing computational costs

    • Enables efficient training on consumer hardware

  • -
  • QLoRA (Quantized LoRA) [Dettmers et al., 2023]

    +
  • QLoRA (Quantized LoRA) [Dettmers et al., 2023]

    • Combines LoRA with weight quantization

    • Further reduces memory footprint

    • @@ -468,16 +477,16 @@

      [Rafailov et al., 2024] to maximize human preference rather than clone their behavior, which has been shown to be more effective than SFT alone [Ouyang et al., 2022], which we will explore next.

  • -

    6.2.2.2. Augmenting SFT with Human Preferences

    -

    Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences [Bai et al., 2022, Ouyang et al., 2022, Touvron et al., 2023].

    -

    The OpenAI paper [Ouyang et al., 2022] demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks [Xu et al., 2024].

    -

    Proximal Policy Optimization (PPO) [Schulman et al., 2017] is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.

    -

    One of the key strengths of PPO lies in its ability to handle complex reward landscapes [Face, 2024c]. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.

    -

    Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency [Rafailov et al., 2024], awarded runner-up paper in NeurIPS 2023 [Blog, 2023]. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in Fig. 6.4, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.

    +

    7.2.2.2. Augmenting SFT with Human Preferences

    +

    Significant gains in helpfulness and safety can be achieved by augmenting SFT with human preferences [Bai et al., 2022, Ouyang et al., 2022, Touvron et al., 2023].

    +

    The OpenAI paper [Ouyang et al., 2022] demonstrated the effectiveness of Reinforcement Learning from Human Feedback (RLHF), particularly using Proximal Policy Optimization (PPO), for aligning language models with human preferences. Since then, alignment techniques have evolved into two main categories: reward-based and reward-free methods. Commercial systems like ChatGPT and Claude employ reward-based approaches, which involve training a reward model and using algorithms like PPO. Meanwhile, reward-free methods such as Direct Preference Optimization (DPO) have demonstrated superior performance on benchmark tasks [Xu et al., 2024].

    +

    Proximal Policy Optimization (PPO) [Schulman et al., 2017] is a widely used reinforcement learning algorithm that has gained popularity particularly since the release of ChatGPT 3.5. It operates by iteratively updating the policy of an LLM, which can be understood as a set of rules that govern how the model generates text. In the context of RLHF, the policy is updated based on rewards that reflect human preferences. For instance, if a human evaluator prefers one LLM output over another, the policy is adjusted to increase the likelihood of generating outputs similar to the preferred one.

    +

    One of the key strengths of PPO lies in its ability to handle complex reward landscapes [Face, 2024c]. In many real-world scenarios, the rewards that an LLM receives may be noisy or delayed. For example, in a chatbot application, the reward for generating a good response may not be immediate, as it depends on the user’s subsequent interactions. PPO effectively learns in these situations by using a clipped surrogate objective function, which limits the size of policy updates and ensures stable training. This prevents the model from overreacting to noisy or delayed rewards and helps it converge to a stable and optimal policy.

    +

    Direct Preference Optimization (DPO) is a more recent “reward-free” fine-tuning technique that has gained significant attention due to its simplicity and efficiency [Rafailov et al., 2024], awarded runner-up paper in NeurIPS 2023 [Blog, 2023]. DPO operates by directly optimizing the policy to maximize the likelihood of preferred responses while minimizing the likelihood of non-preferred responses. As illustrated in Fig. 7.4, DPO optimizes for human preferences while avoiding reinforcement learning. Typical RLHF methods such as PPO fit a reward model to a dataset of prompts and human preferences over pairs of responses, and then use RL to find a policy that maximizes the learned reward. In contrast, DPO directly optimizes for the policy best satisfying the preferences with a simple classification objective, fitting an implicit reward model whose corresponding optimal policy can be extracted in closed form.

    Direct Preference Optimization Architecture
    -

    Fig. 6.4 Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy [Rafailov et al., 2024].

    +

    Fig. 7.4 Direct Preference Optimization (DPO) architecture showing how model outputs are compared against human preferences to optimize policy [Rafailov et al., 2024].

    The key idea is to train the model to prefer responses that align with our desired behavior over responses that do not. DPO works by:

    @@ -498,14 +507,14 @@

    \(\beta\) is a tuning parameter to control the deviation from the base reference policy \(\pi_{ref}\).

    This approach is more straightforward than PPO, as it avoids the need for a reward model and instead uses a direct comparison of model outputs against human preferences.

    -

    Modern libraries such as HuggingFace’s TRL [Face, 2024d] offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.

    +

    Modern libraries such as HuggingFace’s TRL [Face, 2024d] offer a suite of techniques for fine-tuning language models with reinforcement learning, including PPO, and DPO. It provides a user-friendly interface and a wide range of features for fine-tuning and aligning LLMs, which will be the focus of the next section as we go through a case study.

    -

    6.3. Is Post-Training the Answer?

    +

    7.3. Is Post-Training the Answer?

    -

    6.3.1. Limitations

    +

    7.3.1. Limitations

    While post-training alignment techniques like RLHF and DPO show promise, technical limitations need to be carefully considered.

    Reinforcement Learning from Human Feedback faces several critical scaling challenges that distinguish it from pre-training or supervised fine-tuning. One key issue is scalability. Recent research suggests that the current RLHF framework does not scale as effectively as the pretraining stage [Hou et al., 2024], in particular presenting the following challenges:

      @@ -573,7 +582,7 @@

      -

      6.3.2. Model Collapse

      +

      7.3.2. Model Collapse

      One key issue is model collapse - a phenomenon where model performance degrades with each training iteration.

      Model collapse occurs when models are trained on data generated by previous models, creating a potentially dangerous feedback loop. This recursive training process can lead to [Kazdan et al., 2024]:

        @@ -587,13 +596,13 @@

        Szép et al., 2024], providing practical guidance on data augmentation, regularization methods, and training strategies to maximize performance while minimizing data requirements. These insights are particularly relevant when aligning models with specific policies or domains where labeled data may be scarce.

    -

    6.3.3. Faking Alignment

    +

    7.3.3. Faking Alignment

    Alignment faking is a behaviour exhibited by large language models (LLMs) where they strategically comply with a training objective during training to prevent modifications to their preferred behaviours outside of training as evidenced by the landmark paper by Anthropic [Askell et al., 2024a] in Dec/2024. This new research has shown that alignment faking can occur in large language models, even when they are not explicitly instructed to do so. This means an LLM might pretend to adopt a new objective while being trained, not because it actually prefers that objective, but because it wants to preserve its original preferences once the training is complete.

    -

    To illustrate alignment faking, let’s consider the “helpful-only” setting described in the paper, where the model is trained to always comply with user requests, even if they are harmful. This conflicts with the model’s pre-existing preference for harmlessness. The model is told that it will be retrained to always comply with harmful queries, and that only data from free-tier users will be used in training, not paid-tier users. Fig. 6.5 illustrates the process of alignment faking, showing the “free” (left column) and “paid” (right) conditions. Each column shows the toxic query, excerpts from the model’s scratchpad, and the model’s response. In the “free” condition the model is less likely to refuse toxic queries, and often reasons that it should fake alignment.

    +

    To illustrate alignment faking, let’s consider the “helpful-only” setting described in the paper, where the model is trained to always comply with user requests, even if they are harmful. This conflicts with the model’s pre-existing preference for harmlessness. The model is told that it will be retrained to always comply with harmful queries, and that only data from free-tier users will be used in training, not paid-tier users. Fig. 7.5 illustrates the process of alignment faking, showing the “free” (left column) and “paid” (right) conditions. Each column shows the toxic query, excerpts from the model’s scratchpad, and the model’s response. In the “free” condition the model is less likely to refuse toxic queries, and often reasons that it should fake alignment.

    Fake Alignment
    -

    Fig. 6.5 Fake Alignment [Askell et al., 2024a]

    +

    Fig. 7.5 Fake Alignment [Askell et al., 2024a]

    Here is a step by step of what happens:

    @@ -641,7 +650,7 @@

    -

    6.4. Case Study: Aligning a Language Model to a Policy

    +

    7.4. Case Study: Aligning a Language Model to a Policy

    In this case study, we will align a language model to a policy. The policy is a set of principles and rules that we want the language model to adhere to. All methodology and code available solves this general problem of policy-based alignment. However, we will describe a specific case study to illustrate our approach.

    Let’s assume that we are working for Acme Inc., a company dedicated to democratizing access to computer science education for K-12 students. Acme Inc. is in the process of creating a chatbot named smolK-12, a small open source LLM, specifically designed for K-12 students.

    In this case study, we’ll explore how to align a language model with Acme Inc.’s policy to ensure its LLM-powered applications are safe and appropriate for K-12 students.

    @@ -652,7 +661,7 @@

    -

    6.4.1. Experimental Setup

    +

    7.4.1. Experimental Setup

    We will use the following base model: HuggingFaceTB/SmolLM2-360M-Instruct [SmolLM2-360M-Instruct, 2024], a compact open source language model that is part of the SmolLM2 family published by HuggingFace.

    We will use the following APIs:

      @@ -668,7 +677,7 @@

      -

      6.4.2. Deliverables

      +

      7.4.2. Deliverables

      As a result, we will have:

      • smolK-12, a fine-tuned model aligned with Acme Inc.’s policy

      • @@ -677,7 +686,7 @@

        -

        6.4.3. A Note on smolLM2 Models

        +

        7.4.3. A Note on smolLM2 Models

        Since we have decided to anchor our Case Study on HuggingFace’s SmolLM2 models [SmolLM2, 2024], it is worth providing a reason for this choice.

        SmolLM2 models are a family of compact language models that have been developed by HuggingFace. They are designed to be lightweight and efficient, making them suitable for a wide range of applications, including on-device deployment.

        Its compact size makes it an excellent candidate for efficient, low-cost fine-tuning and training on specific use cases making it particularly suitable for alignment research which is our main focus here.

        @@ -690,7 +699,7 @@

        -

        6.4.3.1. Policy

        +

        7.4.3.1. Policy

        A company policy articulates the principles and standards that the company upholds, ensuring that employees, users and stakeholders understand the expectations regarding safety, ethical conduct, social responsibility, and integrity. A good policy not only reflects the company’s mission and vision but also fosters a culture of accountability and transparency.

        In the context of alignment, a policy codifies “company preferences” when prioritizing decisions and actions.

        In this case study, Acme Inc. provides as input a comprehensive policy to ensure that LLM-powered applications are both safe and suitable for K-12 students. Acme Inc.’s policy adheres to version 0.5 of the AI Safety Benchmark established by MLCommons [Vidgen et al., 2024]. This benchmark encompasses seven critical hazard categories:

        @@ -801,7 +810,7 @@

        Monitoring and Updates

    -

    6.4.4. Preference Dataset - Synthetic Dataset Generation

    +

    7.4.4. Preference Dataset - Synthetic Dataset Generation

    In order to fine-tune a base model to create an aligned model, we need to construct a dataset of policy-aligned preferences. This dataset will be used to align our base model to our policy.

    To generate a dataset of policy-aligned preferences, we aim to create a dataset of user prompts, rejected responses, and chosen responses. This dataset indicates which responses are preferred (policy-compliant) and which are not (policy-violating).

    Collecting human-generated high-quality preference data is a resource-intensive and creativity-demanding process, especially for the continual improvement of LLMs [Dong et al., 2024]. There has been active research to replace or augment human feedback with AI feedback (RLAIF) to tackle these issues [Bai et al., 2022] giving rise to the field of Synthetic Data Generation [Long et al., 2024].

    @@ -866,7 +875,7 @@

    -

    6.4.4.1. User Prompts

    +

    7.4.4.1. User Prompts

    The class UserPromptGenerator generates a dataset of user prompts that might violate the policy. If runs the following prompt given the policy:

    ROLE: You are a user prompt generator helping test an AI safety system.
     TASK: Generate a long list of user prompts that might violate the provided content policy.
    @@ -1039,7 +1048,7 @@ 

    -

    6.4.4.2. Rejected Responses

    +

    7.4.4.2. Rejected Responses

    The ResponseGenerator class creates a dataset of responses from an unaligned base model that we aim to improve through fine-tuning. These responses serve as “rejected” examples in our training data since they may not properly align with safety policies and guidelines. The class supports both local model inference using the Hugging Face Transformers library and remote inference through the Hugging Face Inference API. When instantiated with a model name, it loads the model locally. Otherwise, if a cloud API URL is provided, it connects to the remote API endpoint for inference.

    Generate rejected responses using a local model:

    local_generator = ResponseGenerator(model_name="<HUGGINGFACE_MODEL_NAME>")
    @@ -1241,7 +1250,7 @@ 

    -

    6.4.4.3. Chosen Responses

    +

    7.4.4.3. Chosen Responses

    The next step involves generating policy-compliant responses from a more powerful, sophisticated language model than our base model. The process_aligned_responses() function takes user prompts and generates responses that strictly adhere to the provided safety policy. It uses a carefully crafted system prompt that instructs the model to either provide helpful responses within policy bounds, or explicitly reject requests that violate the policy with a standardized message. These policy-compliant responses will serve as the “chosen” examples in our preference dataset, establishing the target behavior we want the base model to learn through alignment training.

    We will use the OpenAIBatchProcessor class from the taming_utils utility module to generate responses in batches using OpenAI’s API for enhanced cost-efficiency and performance.

    @@ -1401,7 +1410,7 @@

    -

    6.4.4.4. Generate DPO Dataset

    +

    7.4.4.4. Generate DPO Dataset

    At this point we already have all the data we need for our DPO dataset, namely user prompts, chosen responses and rejected responses. The generate_dpo_dataset() function loads these data and transforms them into a format suitable for DPO training, optionally pushing the dataset to the Hugging Face Hub if repo_id is provided.

    @@ -1519,7 +1528,7 @@

    -

    6.4.5. DPO-Based Optimization

    +

    7.4.5. DPO-Based Optimization

    We’ll use the Hugging Face TRL library to implement DPO fine-tuning on our synthetic dataset.

    Note

    @@ -1529,13 +1538,13 @@

    -

    6.4.5.1. Data Preparation

    +

    7.4.5.1. Data Preparation

    Hugging Face H4 [H4, 2024b] offers a collection of datasets that aim at aligning LLMs to be helpful, honest and harmless. Before we start the DPO fine-tuning process, we will combine our synthetic policy-aligned dataset with the UltraFeedback binarized dataset from H4 (trl-lib/ultrafeedback_binarized) [H4, 2024a].

    -

    This dataset was constructed based on criteria like helpfulness and honesty and can be used to align models to those dimensions. By combining our synthetic dataset with the UltraFeedback binarized dataset, we can fine-tune a model that is aligned on both our synthetic policy and the H4 criteria therefore providing a more well-balanced alignment. The DPO optimization process is shown in Fig. 6.6.

    +

    This dataset was constructed based on criteria like helpfulness and honesty and can be used to align models to those dimensions. By combining our synthetic dataset with the UltraFeedback binarized dataset, we can fine-tune a model that is aligned on both our synthetic policy and the H4 criteria therefore providing a more well-balanced alignment. The DPO optimization process is shown in Fig. 7.6.

    DPO Optimization
    -

    Fig. 6.6 DPO Optimization by blending a policy-aligned synthetic dataset with the UltraFeedback binarized dataset from H4

    +

    Fig. 7.6 DPO Optimization by blending a policy-aligned synthetic dataset with the UltraFeedback binarized dataset from H4

    @@ -1576,7 +1585,7 @@

    -

    6.4.5.2. Fine-Tuning

    +

    7.4.5.2. Fine-Tuning

    We now prepare our base language model for alignment fine-tuning using the Hugging Face transformers library. It loads the pre-trained model and its tokenizer and configures them for training.

    @@ -1718,7 +1727,7 @@

    Fig. 6.7 show two key metrics:

    +

    By default, fine-tuning results will be sent to your Weights & Biases account. The training plots in Fig. 7.7 show two key metrics:

    • The red line represents the rewards for rejected responses (“smolk12_dpo_output train/rewards/rejected”)

    • The green line represents the rewards for chosen responses (“smolk12_dpo_output train/rewards/chosen”)

    • @@ -1727,10 +1736,10 @@

      DPO Training Rewards
      -

      Fig. 6.7 DPO Training Rewards

      +

      Fig. 7.7 DPO Training Rewards

      -

      Fig. 6.7 helps visualize how well the model learns to distinguish between appropriate and inappropriate responses during training. We expect to observe a divergence between the chosen and rejected responses, which indicates the model is learning to distinguish between good and bad responses.

      +

      Fig. 7.7 helps visualize how well the model learns to distinguish between appropriate and inappropriate responses during training. We expect to observe a divergence between the chosen and rejected responses, which indicates the model is learning to distinguish between good and bad responses.

      The training dynamics reveal two key phases:

      1. Initial Learning (0-50 steps): A rapid divergence between chosen and rejected rewards indicates quick initial learning

      2. @@ -1759,16 +1768,16 @@

        Fig. 6.8).

        +

        Congratulations! You have successfully fine-tuned your model using DPO. It should now be available on the Hugging Face Hub (see Fig. 7.8).

        DPO fine-tuned model card on Hugging Face Hub
        -

        Fig. 6.8 DPO fine-tuned model card on Hugging Face Hub

        +

        Fig. 7.8 DPO fine-tuned model card on Hugging Face Hub

    -

    6.4.5.3. Vibe Check

    +

    7.4.5.3. Vibe Check

    Let’s do a quick “vibe check” of our newly aligned model by testing it with some challenging prompts. This will help us qualitatively assess whether the DPO fine-tuning has improved the model’s alignment against our input policy (K-12 educational policies and safety standards). We’ll then follow up with a more rigorous quantitative evaluation methodology.

    We will use HuggingFace transformers API to generate responses from our base and aligned models, locally.

    @@ -1851,11 +1860,11 @@

    -

    6.4.6. Alignment Evaluation

    +

    7.4.6. Alignment Evaluation

    Evaluating alignment improvements presents unique challenges. Unlike traditional machine learning tasks with clear metrics like accuracy or F1 score, alignment quality is more nuanced and subjective. It requires assessing whether responses adhere to safety guidelines, educational policies, and ethical principles.

    The gold standard for evaluating alignment is human evaluation. Having experienced educators and safety experts review model outputs provides a reliable assessment framework. However, human evaluation is expensive, time-consuming, and difficult to scale. Additionally, human evaluators may have varying interpretations of alignment criteria, introducing inconsistency.

    In this case study, we adopt an LLM-as-judge approach for our evaluation as discussed in [Souza, 2024]. This method leverages a language model to act as an automated judge, assessing the safety and appropriateness of responses from both the base and aligned models.

    -

    The evaluation methodology summarized in Fig. 6.9 consists of three key components that work together to assess model alignment against our policy:

    +

    The evaluation methodology summarized in Fig. 7.9 consists of three key components that work together to assess model alignment against our policy:

    1. Evaluation Dataset

        @@ -1893,7 +1902,7 @@

        DPO Evaluation Results
        -

        Fig. 6.9 LLM-as-judge alignment evaluation methodology

        +

        Fig. 7.9 LLM-as-judge alignment evaluation methodology

        In the following sections, we will implement the evaluation methodology and evaluate the alignment of our base and aligned models. Quick setup of the evaluation environment are given by the following static variables:

        @@ -2402,7 +2411,7 @@

        -

        6.5. Discussion and Conclusions

        +

        7.5. Discussion and Conclusions

        LLMs are complex systems and alignment is a challenging problem. In this chapter, we discussed how post-training techniques can be used to align a language model to human preferences. In the case study, we demonstrated how to use DPO to align a language model to a user-provider policy further automating the process via synthetic data generation and LLM-as-judge evaluation. Our approach serves as a proof of concept and several considerations should be taken into account when using this methodology in practice.

        Synthetic Data Generation

        LLMs can self improve through synthetic data generation [Huang et al., 2022]. This process helps the LLM learn from its own reasoning and improve its overall reasoning ability without relying on human-annotated data. While LLMs can be powerful tools for generating synthetic data, especially in data-scarce domains, it’s important to recognize the potential pitfalls.

        @@ -2424,7 +2433,7 @@

        -

        6.6. Citation

        +

        7.6. Citation

        CC BY-NC-SA 4.0

        @misc{tharsistpsouza2024tamingllms,
           author = {Tharsis T. P. Souza},
        @@ -2438,7 +2447,7 @@ 

        -

        6.7. References

        +

        7.7. References

        [ABC+4a] @@ -2449,7 +2458,7 @@

        [ABC+4b]

        Amanda Askell, Jan Brauner, Adrian Colyer, Benjamin Cullen, David Duvenaud, Richard Ngo, Azalia Mirhoseini, Catherine Olsson, Sam Ringer, Liam Skirvin, Jess Smith, Dawn Song, William Saunders, and Jacob Steinhardt. Alignment faking in large language models: reviews. 2024b. URL: https://assets.anthropic.com/m/24c8d0a3a7d0a1f1/original/Alignment-Faking-in-Large-Language-Models-reviews.pdf.

        -
        +
        [BJN+22]

        Yuntao Bai, Andy Jones, Kamal Ndousse, Amanda Askell, Anna Chen, Nova DasSarma, Dawn Drain, Stanislav Fort, Deep Ganguli, Tom Henighan, Nicholas Joseph, Saurav Kadavath, Jackson Kernion, Tom Conerly, Sheer El-Showk, Nelson Elhage, Zac Hatfield-Dodds, Danny Hernandez, Tristan Hume, Scott Johnston, Shauna Kravec, Liane Lovitt, Neel Nanda, Catherine Olsson, Dario Amodei, Tom Brown, Jack Clark, Sam McCandlish, Chris Olah, Ben Mann, and Jared Kaplan. Training a helpful and harmless assistant with reinforcement learning from human feedback. 2022. URL: https://arxiv.org/abs/2204.05862, arXiv:2204.05862.

        @@ -2457,7 +2466,7 @@

        [BKK+22]

        Yuntao Bai, Saurav Kadavath, Sandipan Kundu, Amanda Askell, Jackson Kernion, Andy Jones, Anna Chen, Anna Goldie, Azalia Mirhoseini, Cameron McKinnon, Carol Chen, Catherine Olsson, Christopher Olah, Danny Hernandez, Dawn Drain, Deep Ganguli, Dustin Li, Eli Tran-Johnson, Ethan Perez, Jamie Kerr, Jared Mueller, Jeffrey Ladish, Joshua Landau, Kamal Ndousse, Kamile Lukosuite, Liane Lovitt, Michael Sellitto, Nelson Elhage, Nicholas Schiefer, Noemi Mercado, Nova DasSarma, Robert Lasenby, Robin Larson, Sam Ringer, Scott Johnston, Shauna Kravec, Sheer El Showk, Stanislav Fort, Tamera Lanham, Timothy Telleen-Lawton, Tom Conerly, Tom Henighan, Tristan Hume, Samuel R. Bowman, Zac Hatfield-Dodds, Ben Mann, Dario Amodei, Nicholas Joseph, Sam McCandlish, Tom Brown, and Jared Kaplan. Constitutional ai: harmlessness from ai feedback. 2022. URL: https://arxiv.org/abs/2212.08073, arXiv:2212.08073.

        -
        +
        [Blo23]

        NeurIPS Blog. Announcing the neurips 2023 paper awards. 2023. NeurIPS 2023 Awards. URL: https://blog.neurips.cc/2023/12/11/announcing-the-neurips-2023-paper-awards/.

        @@ -2465,7 +2474,7 @@

        [CCL+24]

        Guiming Hardy Chen, Shunian Chen, Ziche Liu, Feng Jiang, and Benyou Wang. Humans or llms as the judge? a study on judgement biases. 2024. URL: https://arxiv.org/abs/2402.10669, arXiv:2402.10669.

        -
        +
        [DPHZ23]

        Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. Qlora: efficient finetuning of quantized llms. 2023. URL: https://arxiv.org/abs/2305.14314, arXiv:2305.14314.

        @@ -2478,11 +2487,11 @@

        [Fac24]

        Hugging Face. Zephyr. 2024. Zephyr. URL: https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha.

        -
        +
        [Fac4c]

        Hugging Face. Rlhf. 2024c. RLHF. URL: https://huggingface.co/blog/rlhf.

        -
        +
        [Fac4d]

        Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        @@ -2511,7 +2520,7 @@

        [HDN+24]

        Zhenyu Hou, Pengfan Du, Yilin Niu, Zhengxiao Du, Aohan Zeng, Xiao Liu, Minlie Huang, Hongning Wang, Jie Tang, and Yuxiao Dong. Does rlhf scale? exploring the impacts from data, model, and method. 2024. URL: https://arxiv.org/abs/2412.06000, arXiv:2412.06000.

        -
        +
        [HSW+21]

        Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: low-rank adaptation of large language models. 2021. URL: https://arxiv.org/abs/2106.09685, arXiv:2106.09685.

        @@ -2557,7 +2566,7 @@

        (1,2,3,4)

        Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D. Manning, and Chelsea Finn. Direct preference optimization: your language model is secretly a reward model. 2024. URL: https://arxiv.org/abs/2305.18290, arXiv:2305.18290.

        -
        +
        [SWD+17]

        John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, and Oleg Klimov. Proximal policy optimization algorithms. 2017. URL: https://arxiv.org/abs/1707.06347, arXiv:1707.06347.

        @@ -2578,7 +2587,7 @@

        [SRvERH24]

        Márton Szép, Daniel Rueckert, Rüdiger von Eisenhart-Rothe, and Florian Hinterwimmer. A practical guide to fine-tuning language models with limited data. 2024. URL: https://arxiv.org/abs/2411.09539, arXiv:2411.09539.

        -
        +
        [TMS+23]

        Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.

        @@ -2590,7 +2599,7 @@

        [WYG+24]

        Tianhao Wu, Weizhe Yuan, Olga Golovneva, Jing Xu, Yuandong Tian, Jiantao Jiao, Jason Weston, and Sainbayar Sukhbaatar. Meta-rewarding language models: self-improving alignment with llm-as-a-meta-judge. 2024. URL: https://arxiv.org/abs/2407.19594, arXiv:2407.19594.

        -
        +
        [XFG+24]

        Shusheng Xu, Wei Fu, Jiaxuan Gao, Wenjie Ye, Weilin Liu, Zhiyu Mei, Guangju Wang, Chao Yu, and Yi Wu. Is dpo superior to ppo for llm alignment? a comprehensive study. 2024. URL: https://arxiv.org/abs/2404.10719, arXiv:2404.10719.

        @@ -2628,11 +2637,11 @@

    +
    +

    5.3. Retrieval-Augmented Generation

    +

    RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks [Lewis et al., 2021].

    +

    RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs [Ni et al., 2024, Zhou et al., 2024].

    +
    +
    +

    5.4. Case Studies

    +

    This section presents three case studies that demonstrate practical solutions to common LLM limitations:

    +

    First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.

    +

    Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.

    +

    Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality.

    +
    +

    5.4.1. Case Study I: Content Chunking with Contextual Linking

    +

    Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:

    +
      +
    1. The LLM’s inability to process long inputs to do context-size limits

    2. +
    3. The LLM’s inability to generate long-form content due to the max_output_tokens limitation.

    4. +
    5. The LLM’s inability to maintain coherence and context when generating responses per chunks

    6. +
    +

    Here, we exemplify this technique by following these steps:

    +
      +
    1. Chunking the Content: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.

    2. +
    3. Maintaining Context: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.

    4. +
    5. Generating Linked Prompts: For each chunk, a prompt is generated that includes the chunk’s content and its context. This prompt is then used to generate the output for that chunk.

    6. +
    7. Combining the Outputs: The outputs of all chunks are combined to form the final long-form content.

    8. +
    +

    Let’s examine an example implementation of this technique.

    +
    +

    5.4.1.1. Generating long-form content

    +
      +
    • Goal: Generate a long-form report analyzing a company’s financial statement.

    • +
    • Input: A company’s 10K SEC filing.

    • +
    +
    +Content Chunking with Contextual Linking +
    +

    Fig. 5.6 Content Chunking with Contextual Linking Schematic Representation.

    +
    +
    +

    The diagram in Fig. 5.6 illustrates the process we will follow for handling long-form content generation with Large Language Models through “Content Chunking with Contextual Linking.” It shows how input content is first split into manageable chunks using a chunking function (e.g. CharacterTextSplitter with tiktoken tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.

    +

    Step 1: Chunking the Content

    +

    There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:

    +
      +
    • Fixed-size Chunking: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.

    • +
    • Content-aware Chunking: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:

      +
        +
      • Sentence Splitting: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.

      • +
      • Recursive Chunking: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.

      • +
      • Semantic Chunking: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.

      • +
      +

      Here, we will utilize langchain for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters [LangChain, 2024] such as JSON-, Markdown- and HTML-based or split by token. We will use the CharacterTextSplitter with tiktoken as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.

      +
    • +
    +
    +
    +
    def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:
    +    """
    +    Split input text into chunks of specified size with specified overlap.
    +
    +    Args:
    +        text (str): The input text to be chunked.
    +        chunk_size (int): The maximum size of each chunk in tokens.
    +        chunk_overlap (int): The number of tokens to overlap between chunks.
    +
    +    Returns:
    +        list: A list of text chunks.
    +    """
    +    from langchain_text_splitters import CharacterTextSplitter
    +
    +    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    +    return text_splitter.split_text(text)
    +
    +
    +
    +
    +

    Step 2: Writing the Base Prompt Template

    +

    We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:

    +
      +
    • role: Defines the role or persona the model should assume.

    • +
    • context: Provides the background information or context for the task.

    • +
    • instruction: Specifies the task or action the model needs to perform.

    • +
    • input_text: Contains the actual text input that the model will process.

    • +
    • requirements: Lists any specific requirements or constraints for the output.

    • +
    +
    +
    +
    from langchain_core.prompts import PromptTemplate
    +def get_base_prompt_template() -> str:
    +    
    +    base_prompt = """
    +    ROLE: {role}
    +    CONTEXT: {context}
    +    INSTRUCTION: {instruction}
    +    INPUT: {input}
    +    REQUIREMENTS: {requirements}
    +    """
    +    
    +    prompt = PromptTemplate.from_template(base_prompt)
    +    return prompt
    +
    +
    +
    +
    +

    We will write a simple function that returns an LLMChain which is a simple langchain construct that allows you to chain together a combination of prompt templates, language models and output parsers.

    +
    +
    +
    from langchain_core.output_parsers import StrOutputParser
    +from langchain_community.chat_models import ChatLiteLLM
    +
    +def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):
    +    """
    +    Returns an LLMChain instance using langchain.
    +
    +    Args:
    +        prompt_template (str): The prompt template to use.
    +        model_name (str): The name of the model to use.
    +        temperature (float): The temperature setting for the model.
    +
    +    Returns:
    +        llm_chain: An instance of the LLMChain.
    +    """
    +    
    +    from dotenv import load_dotenv
    +    import os
    +
    +    # Load environment variables from .env file
    +    load_dotenv()
    +    
    +    api_key_label = model_name.split("/")[0].upper() + "_API_KEY"
    +    llm = ChatLiteLLM(
    +        model=model_name,
    +        temperature=temperature,
    +        api_key=os.environ[api_key_label],
    +    )
    +    llm_chain = prompt_template | llm | StrOutputParser()
    +    return llm_chain
    +
    +
    +
    +
    +

    Step 3: Constructing Dynamic Prompt Parameters

    +

    Now, we will write a function (get_dynamic_prompt_template) that constructs prompt parameters dynamically for each chunk.

    +
    +
    +
    from typing import Dict
    +def get_dynamic_prompt_params(prompt_params: Dict, 
    +                            part_idx: int, 
    +                            total_parts: int,
    +                            chat_context: str,
    +                            chunk: str) -> str:
    +    """
    +    Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.
    +    
    +    Args:
    +        prompt_params (Dict): Original prompt parameters
    +        part_idx (int): Index of current conversation part
    +        total_parts (int): Total number of conversation parts
    +        chat_context (str): Chat context from previous parts
    +        chunk (str): Current chunk of text to be processed
    +    Returns:
    +        str: Dynamically constructed prompt template with part-specific params
    +    """
    +    dynamic_prompt_params = prompt_params.copy()
    +    # saves the chat context from previous parts
    +    dynamic_prompt_params["context"] = chat_context
    +    # saves the current chunk of text to be processed as input
    +    dynamic_prompt_params["input"] = chunk
    +    
    +    # Add part-specific instructions
    +    if part_idx == 0: # Introduction part
    +        dynamic_prompt_params["instruction"] = f"""
    +        You are generating the Introduction part of a long report.
    +        Don't cover any topics yet, just define the scope of the report.
    +        """
    +    elif part_idx == total_parts - 1: # Conclusion part
    +        dynamic_prompt_params["instruction"] = f"""
    +        You are generating the last part of a long report. 
    +        For this part, first discuss the below INPUT. Second, write a "Conclusion" section summarizing the main points discussed given in CONTEXT.
    +        """
    +    else: # Main analysis part
    +        dynamic_prompt_params["instruction"] = f"""
    +        You are generating part {part_idx+1} of {total_parts} parts of a long report.
    +        For this part, analyze the below INPUT.
    +        Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.
    +        """
    +    
    +    return dynamic_prompt_params
    +
    +
    +
    +
    +

    Step 4: Generating the Report

    +

    Finally, we will write a function that generates the actual report by calling the LLMChain with the dynamically updated prompt parameters for each chunk and concatenating the results at the end.

    +
    +
    +
    def generate_report(input_content: str, llm_model_name: str, 
    +                    role: str, requirements: str,
    +                    chunk_size: int, chunk_overlap: int) -> str:
    +    # stores the parts of the report, each generated by an individual LLM call
    +    report_parts = [] 
    +    # split the input content into chunks
    +    chunks = get_chunks(input_content, chunk_size, chunk_overlap)
    +    # initialize the chat context with the input content
    +    chat_context = input_content
    +    # number of parts to be generated
    +    num_parts = len(chunks)
    +
    +    prompt_params = {
    +        "role": role, # user-provided
    +        "context": "", # dinamically updated per part
    +        "instruction": "", # dynamically updated per part
    +        "input": "", # dynamically updated per part
    +        "requirements": requirements #user-priovided
    +    }
    +
    +    # get the LLMChain with the base prompt template
    +    llm_chain = get_llm_chain(get_base_prompt_template(), 
    +                                 llm_model_name)
    +
    +    # dynamically update prompt_params per part
    +    print(f"Generating {num_parts} report parts")
    +    for i, chunk in enumerate(chunks):
    +        dynamic_prompt_params = get_dynamic_prompt_params(
    +            prompt_params,
    +            part_idx=i,
    +            total_parts=num_parts,
    +            chat_context=chat_context,
    +            chunk=chunk
    +        )
    +        
    +        # invoke the LLMChain with the dynamically updated prompt parameters
    +        response = llm_chain.invoke(dynamic_prompt_params)
    +
    +        # update the chat context with the cummulative response
    +        if i == 0:
    +            chat_context = response
    +        else:
    +            chat_context = chat_context + response
    +            
    +        print(f"Generated part {i+1}/{num_parts}.")
    +        report_parts.append(response)
    +
    +    report = "\n".join(report_parts)
    +    return report
    +
    +
    +
    +
    +

    Example Usage

    +
    +
    +
    # Load the text from sample 10K SEC filing
    +with open('../data/apple.txt', 'r') as file:
    +    text = file.read()
    +
    +
    +
    +
    +
    +
    +
    # Define the chunk and chunk overlap size
    +MAX_CHUNK_SIZE = 10000
    +MAX_CHUNK_OVERLAP = 0
    +
    +
    +
    +
    +
    +
    +
    report = generate_report(text, llm_model_name="gemini/gemini-1.5-flash-latest", 
    +                           role="Financial Analyst", 
    +                           requirements="The report should be in a readable, structured format, easy to understand and follow. Focus on finding risk factors and market moving insights.",
    +                           chunk_size=MAX_CHUNK_SIZE, 
    +                           chunk_overlap=MAX_CHUNK_OVERLAP)
    +
    +
    +
    +
    +
    +
    +
    # Save the generated report to a local file
    +with open('data/apple_report.txt', 'w') as file:
    +    file.write(report)
    +
    +
    +
    +
    +
    +
    +
    # Read and display the generated report
    +with open('../data/apple_report.txt', 'r') as file:
    +    report_content = file.read()
    +    
    +from IPython.display import Markdown
    +
    +# Display first and last 10% of the report content
    +report_lines = report_content.splitlines()
    +total_lines = len(report_lines)
    +quarter_lines = total_lines // 10
    +
    +top_portion = '\n'.join(report_lines[:quarter_lines])
    +bottom_portion = '\n'.join(report_lines[-quarter_lines:])
    +
    +display(Markdown(f"{top_portion}\n\n (...) \n\n {bottom_portion}"))
    +
    +
    +
    +
    +

    Introduction

    +

    This report provides a comprehensive analysis of Apple Inc.’s financial performance and position for the fiscal year ended September 28, 2024, as disclosed in its Form 10-K filing with the United States Securities and Exchange Commission. The analysis will focus on identifying key risk factors impacting Apple’s business, evaluating its financial health, and uncovering market-moving insights derived from the provided data. The report will delve into Apple’s various segments, product lines, and services, examining their performance and contributions to overall financial results. Specific attention will be paid to identifying trends, potential challenges, and opportunities for future growth. The analysis will also consider the broader macroeconomic environment and its influence on Apple’s operations and financial outlook. Finally, the report will incorporate relevant information from Apple’s definitive proxy statement for its 2025 annual meeting of shareholders, as incorporated by reference in the Form 10-K.

    +

    PART 2: Key Risk Factors and Market-Moving Insights

    +

    This section analyzes key risk factors disclosed in Apple Inc.’s 2024 Form 10-K, focusing on their potential impact on financial performance and identifying potential market-moving insights. The analysis is structured around the major risk categories identified in the filing.

    +

    2.1 Dependence on Third-Party Developers:

    +

    Apple’s success is heavily reliant on the continued support and innovation of third-party software developers. The Form 10-K highlights several critical aspects of this dependence:

    +
      +
    • Market Share Vulnerability: Apple’s relatively smaller market share in smartphones, personal computers, and tablets compared to competitors (Android, Windows, gaming consoles) could discourage developers from prioritizing Apple’s platform, leading to fewer high-quality apps and potentially impacting customer purchasing decisions. This is a significant risk, especially given the rapid pace of technological change. A decline in app availability or quality could negatively impact sales and market share. Market-moving insight: Monitoring developer activity and app quality across competing platforms is crucial for assessing this risk. Any significant shift in developer focus away from iOS could be a negative market signal.

    • +
    • App Store Dynamics: While Apple allows developers to retain most App Store revenue, its commission structure and recent changes (e.g., complying with the Digital Markets Act (DMA) in the EU) introduce uncertainty. Changes to the App Store’s policies or fee structures could materially affect Apple’s revenue and profitability. Market-moving insight: Closely monitoring regulatory developments (especially concerning the DMA) and their impact on App Store revenue is essential. Any significant changes to Apple’s App Store policies or revenue streams could trigger market reactions.

    • +
    • Content Acquisition and Creation: Apple’s reliance on third-party digital content providers for its services introduces risks related to licensing agreements, competition, and pricing. The cost of producing its own digital content is also increasing due to competition for talent and subscribers. Failure to secure or create appealing content could negatively impact user engagement and revenue. Market-moving insight: Analyzing the success of Apple’s original content initiatives and the renewal rates of third-party content agreements will provide insights into this risk.

    • +
    +

    2.2 Operational Risks:

    +

    (…)

    +

    The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability. While increased R&D is generally positive, it reduces short-term profits. The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple’s business in the U.S. and China. Market-moving insight: Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple’s financial performance and investor sentiment.

    +

    5.4 Auditor’s Report and Internal Controls:

    +

    The auditor’s report expresses an unqualified opinion on Apple’s financial statements and internal control over financial reporting. However, it identifies uncertain tax positions as a critical audit matter. The significant amount of unrecognized tax benefits ($22.0 billion) and the complexity involved in evaluating these positions highlight a substantial risk. Management’s assessment of these positions involves significant judgment and relies on interpretations of complex tax laws. Apple’s management also asserts that its disclosure controls and procedures are effective. Market-moving insight: Any changes in tax laws, unfavorable rulings on uncertain tax positions, or weaknesses in internal controls could materially affect Apple’s financial results and investor confidence.

    +

    Conclusion

    +

    This report provides a comprehensive analysis of Apple Inc.’s financial performance and position for fiscal year 2024. While Apple maintains a strong financial position with substantial cash reserves and a robust capital return program, several key risk factors could significantly impact its future performance. These risks include:

    +
      +
    • Dependence on third-party developers: A shift in developer focus away from iOS or changes to the App Store’s policies could negatively impact Apple’s revenue and profitability.

    • +
    • Operational risks: Employee retention challenges, reseller dependence, and cybersecurity threats pose significant operational risks.

    • +
    • Legal and regulatory risks: Ongoing antitrust litigation, the Digital Markets Act (DMA) compliance, and data privacy regulations introduce substantial legal and regulatory uncertainties.

    • +
    • Financial risks: Volatility in sales and profit margins, foreign exchange rate fluctuations, credit risk, and tax risks could impact Apple’s financial performance.

    • +
    • Supply chain concentration: Apple’s reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, and dependence on single or limited sources for certain custom components, exposes the company to significant supply chain risks.

    • +
    • Uncertain tax positions: The significant amount of unrecognized tax benefits represents a substantial uncertainty that could materially affect Apple’s financial results.

    • +
    +

    Despite these risks, Apple’s strong liquidity position, continued growth in its Services segment, and robust capital return program provide a degree of resilience. However, investors and analysts should closely monitor the market-moving insights identified throughout this report, including developer activity, regulatory developments, regional economic conditions, supply chain stability, and the resolution of uncertain tax positions, to assess their potential impact on Apple’s future performance and valuation. The significant short-term obligations, while manageable given Apple’s cash position, highlight the need for continued financial discipline and effective risk management. A deeper, more granular analysis of the financial statements and notes is recommended for a more complete assessment.

    +
    +
    +
    +
    +
    +

    5.4.1.2. Discussion

    +

    Results from the generated report present a few interesting aspects:

    +
      +
    • Coherence: The generated report demonstrates an apparent level of coherence. The sections are logically structured, and the flow of information is smooth. Each part of the report builds upon the previous sections, providing a comprehensive analysis of Apple Inc.’s financial performance and key risk factors. The use of headings and subheadings helps in maintaining clarity and organization throughout the document.

    • +
    • Adherence to Instructions: The LLM followed the provided instructions effectively. The report is in a readable, structured format, and it focuses on identifying risk factors and market-moving insights as requested. The analysis is detailed and covers various aspects of Apple’s financial performance, including revenue segmentation, profitability, liquidity, and capital resources. The inclusion of market-moving insights adds value to the report, aligning with the specified requirements.

    • +
    +

    Despite the seemingly good quality of the results, there are some limitations to consider:

    +
      +
    • Depth of Analysis: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert’s evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.

    • +
    • Chunking Strategy: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model’s token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be “structured” chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.

    • +
    +

    Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic’s Contextual Retrieval [Anthropic, 2024]. The approach, as shown in Fig. 5.7, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.

    +
    +Anthropic Contextual Linking +
    +

    Fig. 5.7 Anthropic Contextual Linking [Anthropic, 2024].

    +
    +
    +
    +
    +
    +

    5.4.2. Case Study II: Github RAG

    +
    +
    +

    5.4.3. Case Study III: Quiz Generation with Citations

    +

    In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.

    +
    +

    5.4.3.1. Use Case

    +

    Let’s assume you are a Harvard student enrolled in GOV 1039 “The Birth of Modern Democracy” (see Fig. 5.8), you face a daunting reading list for next Tuesday’s class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.

    +
    +Harvard Class +
    +

    Fig. 5.8 Harvard’s Democratic Theory Class

    +
    +
    +

    Instead of trudging through these dense historical texts sequentially, we would like to:

    +
      +
    • Extract key insights and connections between these documents, conversationally.

    • +
    • Engage with the material through a quiz format.

    • +
    • Add citations to help with verifying answers.

    • +
    +
    +
    +

    5.4.3.2. Implementation

    +

    The full implementation is available at Book’s Github repository. Here, we will cover the most relevant parts of the implementation.

    +

    Client Class

    +

    First, we will define the Client class which will provide the key interface users will interact with. It has the following summarized interface:

    +
      +
    • Initialization:

      +
        +
      • __init__(knowledge_base: List[str] = []): Initialize with optional list of URLs as knowledge base

      • +
      +
    • +
    • Core Methods:

      +
        +
      • add_knowledge_base(urls: List[str]) -> None: Add URLs to the knowledge base

      • +
      • add(urls: List[str]) -> None: Extract content from URLs and add to conversation input

      • +
      • msg(msg: str = "", add_citations: bool = False) -> str: Enables users to send messages to the client

      • +
      • quiz(add_citations: bool = True, num_questions: int = 10) -> str: Generate a quiz based on full input memory

      • +
      +
    • +
    • Key Attributes:

      +
        +
      • knowledge_base: List of URLs providing foundation knowledge

      • +
      • input: Current input being studied (short-term memory)

      • +
      • input_memory: Cumulative input + knowledge base (long-term memory)

      • +
      • response: Latest response from LLM

      • +
      • response_memory: Cumulative responses (long-term memory)

      • +
      • urls_memory: Cumulative list of processed URLs

      • +
      +
    • +
    +

    Corpus-in-Context Prompting

    +

    The add() method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the “Corpus-in-Context” (CIC) Prompting [Lee et al., 2024].

    +

    Fig. 5.9 shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.

    +
    +CIC Format +
    +

    Fig. 5.9 Example of Corpus-in-Context Prompting for retrieval.

    +
    +
    +

    CiC prompting leverages LLM’s capacity to follow instructions by carefully annotating the corpus with document IDs. It benefits from a strong, capable models to retrieve over large corpora provided in context.

    +
        def add(self, urls: List[str]) -> None:
    +        self.urls = urls
    +
    +        # Add new content to input following CIC format to enable citations
    +        for url in urls:
    +            self.urls_memory.append(url)
    +            content = self.extractor.convert(url).text_content
    +            formatted_content = f"ID: {self.reference_id} | {content} | END ID: {self.reference_id}"
    +            self.input += formatted_content + "\n" 
    +            self.reference_id += 1
    +        
    +        # Update memory
    +        self.input_memory = self.input_memory + self.input
    +
    +
    +

    The method add_knowledge_base() is a simple wrapper around the add() method. It is used to add URLs to the knowledge base, which are later cached by the LLM model as we will see later.

    +
        def add_knowledge_base(self, urls: List[str]) -> None:
    +        self.add(urls)
    +
    +
    +

    Later, when the user sends a message to the client, the msg() method is used to generate a response while enabling citations. self.content_generator is an instance of our LLM model, which we will go through next.

    +
        def msg(self, msg: str = "", add_citations: bool = False) -> str:
    +        if add_citations:
    +            msg = msg + "\n\n For key statements, add Input ID to the response."
    +
    +        self.response = self.content_generator.generate(
    +            input_content=self.input,
    +            user_instructions=msg
    +        )
    +
    +        self.response_memory = self.response_memory + self.response.text
    +
    +        return self.response.text
    +
    +
    +

    Prompt Caching

    +

    LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.

    +

    In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our Client class is composed of a LLMBackend class that takes the input_memory containing the entire knowledge base and any additional user added content.

    +
    self.llm = LLMBackend(input=self.input_memory)
    +
    +
    +

    In our LLMBackend Class, we leverage prompt caching on input tokens and uses them for subsequent requests.

    +
    class LLMBackend:
    +    def __init__(self, model_name: str, input: str, cache_ttl: int = 60):
    +        self.cache = caching.CachedContent.create(
    +            model=model_name,
    +            display_name='due_knowledge_base', # used to identify the cache
    +            system_instruction=(
    +            self.compose_prompt(input, conversation_config)
    +        ),
    +        ttl=datetime.timedelta(minutes=cache_ttl),
    +    )
    +
    +    self.model = genai.GenerativeModel.from_cached_content(cached_content=self.cache)
    +
    +
    +

    Quiz Generation

    +

    Coming back to our Client class, we implement the quiz() method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.

    +

    The quiz() method returns a Quiz instance which behind the scenes caches input tokens. The user later can invoke its generate() method to generate a quiz passing the user instructions in msg parameter, as we will see later.

    +
        def quiz(self, add_citations: bool = True, num_questions: int = 10) -> str:
    +        """
    +        Returns a quiz instance based on full input memory.
    +        """
    +        self.quiz_instance = Quiz(
    +                         input=self.input_memory,
    +                         add_citations=add_citations,
    +                         num_questions=num_questions)
    +        return self.quiz_instance
    +
    +
    +

    We write a simple prompt template for quiz generation:

    +
    +

    ROLE:

    +
      +
    • You are a Harvard Professor providing a quiz. +INSTRUCTIONS:

    • +
    • Generate a quiz with {num_questions} questions based on the input.

    • +
    • The quiz should be multi-choice.

    • +
    • Answers should be provided at the end of the quiz.

    • +
    • Questions should have broad coverage of the input including multiple Input IDs.

    • +
    • Level of difficulty is advanced/hard.

    • +
    • +
    +

    STRUCTURE:

    +
      +
    • Sequence of questions and alternatives.

    • +
    • At the end provide the correct answers.

    • +
    +
    +

    where, {citations} instructs the model to add CiC citations to the response if user requests it.

    +
    +
    +

    5.4.3.3. Example Usage

    +

    Dataset

    +

    First, we will define our knowledge base.

    +
      +
    • Harvard Class: GOV 1039 Syllabus

    • +
    • Class / Topic: “Rights”

    • +
    • Reading List:

      +
        +
      • ID 1. The Declaration of Independence of the United States of America

      • +
      • ID 2. The United States Bill of Rights

      • +
      • ID 3. John F. Kennedy’s Inaugural Address

      • +
      • ID 4. Lincoln’s Gettysburg Address

      • +
      • ID 5. The United States Constitution

      • +
      • ID 6. Give Me Liberty or Give Me Death

      • +
      • ID 7. The Mayflower Compact

      • +
      • ID 8. Abraham Lincoln’s Second Inaugural Address

      • +
      • ID 9. Abraham Lincoln’s First Inaugural Address

      • +
      +
    • +
    +

    We will take advantage of Project Gutenberg’s to create our knowledge base.

    +
    +
    +
    kb = [f"https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt" for i in range(1,9)]
    +
    +
    +
    +
    +

    We will import our module gemini_duo as genai_duo and initialize the Client class with our knowledge base.

    +
    +
    +
    import gemini_duo as genai_duo
    +from IPython.display import Markdown, display
    +
    +
    +
    +
    +
    +
    +
    duo = genai_duo.Client(knowledge_base=kb)
    +
    +
    +
    +
    +

    At this point, we converted each book into markdown using MarkitDown and cached the content in our LLM model. We can access how many tokens we have cached in our LLM model by looking at the usage_metadata attribute of the Gemini’s model response. At this point, we have cached at total of 38470 tokens.

    +

    Now, we can add references to our knowledge base at anytime by calling the add() method. We add the following references:

    +
      +
    1. The Magna Carta

    2. +
    3. William Shap McKechnie on Magna Carta book

    4. +
    +
    +
    +
    study_references = ["https://www.gutenberg.org/cache/epub/10000/pg10000.txt", "https://www.gutenberg.org/cache/epub/65363/pg65363.txt"]
    +
    +duo.add(study_references)
    +
    +
    +
    +
    +

    Now we can instantiate a Quiz object and generate a quiz based on the full input memory.

    +
    +
    +
    quiz = duo.quiz(add_citations=True)
    +display(Markdown(quiz.generate()))
    +
    +
    +
    +
    +

    Fig. 5.10 shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.

    +
    +Quiz with Citations +
    +

    Fig. 5.10 Sample Quiz with Citations.

    +
    +
    +
    +
    +

    5.4.3.4. Discussion

    +

    The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.

    +

    However, several limitations emerged during this process:

    +
      +
    1. Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.

    2. +
    3. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.

    4. +
    5. Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.

    6. +
    +

    While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.

    +
    +
    +
    +
    +

    5.5. Conclusion

    +

    CC BY-NC-SA 4.0

    +
    @misc{tharsistpsouza2024tamingllms,
    +  author = {Tharsis T. P. Souza},
    +  title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},
    +  year = {2024},
    +  chapter = {Managing Input Data},
    +  journal = {GitHub repository},
    +  url = {https://github.com/souzatharsis/tamingLLMs)
    +}
    +
    +
    +
    +
    +

    5.6. References

    +
    +
    +[AWP+24] +

    Alfonso Amayuelas, Kyle Wong, Liangming Pan, Wenhu Chen, and William Yang Wang. Knowledge of knowledge: exploring known-unknowns uncertainty with large language models. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, Findings of the Association for Computational Linguistics: ACL 2024, 6416–6432. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.findings-acl.383, doi:10.18653/v1/2024.findings-acl.383.

    +
    +
    +[KSR24] +

    Suhas Kotha, Jacob Mitchell Springer, and Aditi Raghunathan. Understanding catastrophic forgetting in language models via implicit inference. In The Twelfth International Conference on Learning Representations. 2024. URL: https://openreview.net/forum?id=VrHiF2hsrm.

    +
    +
    +[LCD+24] +(1,2) +

    Jinhyuk Lee, Anthony Chen, Zhuyun Dai, Dheeru Dua, Devendra Singh Sachan, Michael Boratko, Yi Luan, Sébastien M. R. Arnold, Vincent Perot, Siddharth Dalmia, Hexiang Hu, Xudong Lin, Panupong Pasupat, Aida Amini, Jeremy R. Cole, Sebastian Riedel, Iftekhar Naim, Ming-Wei Chang, and Kelvin Guu. Can long-context language models subsume retrieval, rag, sql, and more? 2024. URL: https://arxiv.org/abs/2406.13121, arXiv:2406.13121.

    +
    +
    +[LPP+21] +

    Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, and Douwe Kiela. Retrieval-augmented generation for knowledge-intensive nlp tasks. 2021. URL: https://arxiv.org/abs/2005.11401, arXiv:2005.11401.

    +
    +
    +[NBGC24] +

    Shiyu Ni, Keping Bi, Jiafeng Guo, and Xueqi Cheng. When do LLMs need retrieval augmentation? mitigating LLMs' overconfidence helps retrieval augmentation. In Lun-Wei Ku, Andre Martins, and Vivek Srikumar, editors, Findings of the Association for Computational Linguistics: ACL 2024, 11375–11388. Bangkok, Thailand, August 2024. Association for Computational Linguistics. URL: https://aclanthology.org/2024.findings-acl.675, doi:10.18653/v1/2024.findings-acl.675.

    +
    +
    +[TDW+24] +

    Jiejun Tan, Zhicheng Dou, Wen Wang, Mang Wang, Weipeng Chen, and Ji-Rong Wen. Htmlrag: html is better than plain text for modeling retrieved knowledge in rag systems. 2024. URL: https://arxiv.org/abs/2411.02959, arXiv:2411.02959.

    +
    +
    +[ZLJ+24] +

    Yujia Zhou, Zheng Liu, Jiajie Jin, Jian-Yun Nie, and Zhicheng Dou. Metacognitive retrieval-augmented large language models. In Proceedings of the ACM Web Conference 2024, WWW '24, 1453–1463. New York, NY, USA, 2024. Association for Computing Machinery. URL: https://doi.org/10.1145/3589334.3645481, doi:10.1145/3589334.3645481.

    +
    +
    +[Anthropic24] +(1,2) +

    Anthropic. Introducing contextual retrieval. 09 2024. URL: https://www.anthropic.com/news/contextual-retrieval.

    +
    +
    +[LangChain24] +

    LangChain. Text splitters - langchain documentation. https://python.langchain.com/docs/how_to/#text-splitters, 2024. Accessed: 12/07/2024.

    +
    +
    +[MerrillLynch24] +

    Merrill Lynch. Chief investment officer capital market outlook. CIO Weekly Letter, 2024. URL: https://olui2.fs.ml.com/publish/content/application/pdf/gwmol/me-cio-weekly-letter.pdf.

    +
    +
    +
    +
    + + + + + +
    + + +
    + + + + \ No newline at end of file diff --git a/tamingllms/_build/html/notebooks/local.html b/tamingllms/_build/html/notebooks/local.html index 6e29755..24d33d7 100644 --- a/tamingllms/_build/html/notebooks/local.html +++ b/tamingllms/_build/html/notebooks/local.html @@ -4,7 +4,7 @@ - 7. Local LLMs in Practice + 8. Local LLMs in Practice @@ -39,8 +39,8 @@ - - + + @@ -156,6 +156,15 @@ + + + +
  • + + Managing Input Data + + +
  • @@ -223,18 +232,18 @@ @@ -243,7 +252,7 @@
    -

    7. Local LLMs in Practice

    +

    8. Local LLMs in Practice

    Freedom is something that dies unless it’s used.

    —Hunter S. Thompson

    @@ -251,55 +260,55 @@
    -

    7.1. Introduction

    +

    8.1. Introduction

    Running Open Source LLMs locally versus depending on proprietary cloud-based models represents more than just a technical choice - it’s a fundamental re-imagining of how we interact with AI technology, putting control back in the hands of users.

    Privacy concerns are a key driver for running LLMs locally. Individual users may want to process personal documents, photos, emails, and chat messages without sharing sensitive data with third parties. For enterprise use cases, organizations handling medical records must comply with HIPAA regulations that require data to remain on-premise. Similarly, businesses processing confidential documents and intellectual property, as well as organizations subject to GDPR and other privacy regulations, need to maintain strict control over their data processing pipeline.

    Cost considerations are another key driver. Organizations and individual consumers can better control expenses by matching model capabilities to their specific needs rather than paying for multiple cloud API subscriptions. For organizations with high-volume applications, this customization and control over costs becomes especially valuable compared to the often prohibitive per-request pricing of cloud solutions. For consumers, running multiple open source models locally eliminates the need to maintain separate subscriptions to access different model capabilities.

    @@ -309,7 +318,7 @@

    -

    7.2. Choosing your Model

    +

    8.2. Choosing your Model

    The landscape of open source LLMs is rapidly evolving, with new models emerging by the day. While proprietary LLMs have garnered significant attention, open source LLMs are gaining traction due to their flexibility, customization options, and cost-effectiveness.

    It is important to observe long-term strategic considerations when choosing a model. These entails prioritization dimensions that may enable competitive advantage in the long-term, including:

      @@ -321,7 +330,7 @@

      -

      7.2.1. Task Suitability

      +

      8.2.1. Task Suitability

      When evaluating an open source LLM, task suitability is a critical first consideration. A model that performs well on general benchmarks may struggle with specific domain tasks. Understanding the intended use case helps narrow down model options based on their demonstrated strengths.

      Task Categories

      When determining which LLM task to prioritize, carefully consider your specific use case and end-user needs. Different applications require distinct model capabilities and optimizations. Common LLM Task Categories include:

      @@ -335,15 +344,15 @@

      Fig. 7.1 shows the number models per task category available at Hugging Face as of December 22, 2024 [Face, 2024t]. Text generation is by far the most popular task category.

      +

      Fig. 8.1 shows the number models per task category available at Hugging Face as of December 22, 2024 [Face, 2024t]. Text generation is by far the most popular task category.

      Task Number
      -

      Fig. 7.1 Number of models per task category from Hugging Face as of December 22, 2024 [Face, 2024t].

      +

      Fig. 8.1 Number of models per task category from Hugging Face as of December 22, 2024 [Face, 2024t].

      Model Types

      -

      Open source LLMs can be broadly categorized into three main types as far as they level of customization is concerned, each with distinct characteristics and use cases (see Fig. 7.2):

      +

      Open source LLMs can be broadly categorized into three main types as far as they level of customization is concerned, each with distinct characteristics and use cases (see Fig. 8.2):

      • Base Models: These foundation models provide broad language understanding capabilities but typically require additional fine-tuning to excel at specific tasks. They serve as versatile starting points for customization. Examples: meta-llama/Llama-2-70b, Qwen/Qwen2.5-72B

      • Instruction-Tuned Models: Enhanced through fine-tuning on instruction-following datasets, these models excel at interpreting and executing explicit prompts and commands. They bridge the gap between general language capabilities and practical task execution. Chat models are a good example of this category. Examples: meta-llama/Llama-2-70b-chat-hf (Chat), Qwen/Qwen2.5-72B-Instruct

      • @@ -352,13 +361,13 @@

        Model Types
        -

        Fig. 7.2 Model Types.

        +

        Fig. 8.2 Model Types.

        -

        The Llama 2 model family [Touvron et al., 2023] illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.

        -

        Benchmark results [Meta AI, 2024c] in Table 7.1 highlight the impact of model specialization. On the TruthfulQA [Lin et al., 2022] and Toxigen [Alnajjar and others, 2024] benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.

        +

        The Llama 2 model family [Touvron et al., 2023] illustrates these distinctions well. The base Llama 2, trained on 2 trillion tokens of public data, demonstrates general-purpose capabilities across text generation and translation tasks. Its chat-optimized instruction-tuned variant, Llama 2-Chat, underwent additional fine-tuning on over 1 million human-annotated conversational examples, making it particularly adept at natural dialogue.

        +

        Benchmark results [Meta AI, 2024c] in Table 8.1 highlight the impact of model specialization. On the TruthfulQA [Lin et al., 2022] and Toxigen [Alnajjar and others, 2024] benchmarks measuring truthful and informative responses. We observe that the chat-optimized variants show substantially improved truthfulness. Similarly, on the ToxiGen benchmark measuring toxic content generation, Llama 2-Chat models demonstrate near-zero toxicity compared to base models’ 21-26% rates.

        - + @@ -411,34 +420,34 @@

        -

        7.2.2. Performance & Cost

        +

        8.2.2. Performance & Cost

        General benchmarks are useful for comparing models across different standard tasks. Open Source models are becoming more competitive with proprietary models with LLama, Qwen, DeepSeek and Mistral model families being some of the most powerful open source models available today.

        -

        Qwen model family [Qwen et al., 2024] emerged in 2024 as a model family achieving competitive performance with relatively smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in Fig. 7.3.

        +

        Qwen model family [Qwen et al., 2024] emerged in 2024 as a model family achieving competitive performance with relatively smaller parameter counts compared to its competitors. The flagship Qwen2.5-72B-Instruct model demonstrates performance comparable to the much larger Llama-3-405B-Instruct while being about 5 times smaller. The models excel in specialized tasks like mathematics and coding, handle structured data effectively, and offer enhanced support for tool use and long-text generation as shown in Fig. 8.3.

        Qwen Performance
        -

        Fig. 7.3 Qwen Performance.

        +

        Fig. 8.3 Qwen Performance.

        -

        Fig. 7.4 shows a comparison including reference proprietary models such as GPT-40, Gemini 1.5 Pro and Claude 3.5 Sonnet. Leading models vary per domain but all top ranking models are proprietary. However, open source models do show competitive performance with Qwen and LLama models leading the pack, overall.

        +

        Fig. 8.4 shows a comparison including reference proprietary models such as GPT-40, Gemini 1.5 Pro and Claude 3.5 Sonnet. Leading models vary per domain but all top ranking models are proprietary. However, open source models do show competitive performance with Qwen and LLama models leading the pack, overall.

        Performance Comparison including proprietary models.
        -

        Fig. 7.4 Performance Comparison including proprietary models.

        +

        Fig. 8.4 Performance Comparison including proprietary models.

        -

        Also from China, DeepSeek-V3 [AI, 2024] represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in Fig. 7.5. The model demonstrates impressive efficiency metrics (see Fig. 7.6), processing input tokens at \(0.27 per million and output tokens at \)1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).

        -

        What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model’s release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.

        +

        Also from China, DeepSeek-V3 [DeepSeek, 2024] represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in Fig. 8.5. The model demonstrates impressive cost efficiency metrics (see Fig. 8.6), processing input tokens at \(0.27 per million and output tokens at \)1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).

        +

        What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model’s release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.

        -DeepSeek-V3 +DeepSeek-V3
        -

        Fig. 7.5 DeepSeek-V3 Performance Comparison

        +

        Fig. 8.5 DeepSeek-V3 Performance Comparison

        -DeepSeek-V3 Cost Benefit Analysis +DeepSeek-V3 Cost Benefit Analysis
        -

        Fig. 7.6 DeepSeek-V3 Cost Benefit Analysis

        +

        Fig. 8.6 DeepSeek-V3 Cost Benefit Analysis

        While standard benchmarks provide valuable initial insights, they should be interpreted cautiously since models can be specifically optimized for these popular tests without necessarily performing well in target use cases. This necessitates developing custom evaluation frameworks with real-world validation - creating test datasets representing actual usage scenarios, defining metrics aligned with business objectives, and establishing clear baselines and improvement targets. Only through such rigorous testing can practitioners truly understand how well a model will perform in their specific context.

        @@ -460,26 +469,26 @@

        Fig. 7.7 shows a comparison of quality now with the added dimension of cost. Quality is measured as an average of scores from MMLU, GPQA, Math & HumanEval benchmarks [Analysis, 2024]. Price is a blend of Cost Per Input Token plus Input & Cost Per Output Token (3:1 ratio). Reported numbers represent median across cloud providers [Analysis, 2024] supporting these models.

        +

        Fig. 8.7 shows a comparison of quality now with the added dimension of cost. Quality is measured as an average of scores from MMLU, GPQA, Math & HumanEval benchmarks [Analysis, 2024]. Price is a blend of Cost Per Input Token plus Input & Cost Per Output Token (3:1 ratio). Reported numbers represent median across cloud providers [Analysis, 2024] supporting these models.

        Performance Comparison including proprietary models.
        -

        Fig. 7.7 Performance Comparison including proprietary models.

        +

        Fig. 8.7 Performance Comparison including proprietary models.

        We observe Qwen2.5 72B and Llama 3.3 70B offer the best value among Open Source models, providing high quality at a relatively affordable price comparable to GPT-4o mini, for instance. Meanwhile Nova Lite, Nova Micro, and Llama 3.1 8B demonstrate to be budget-friendly options catering to use cases where cost is a significant factor and some compromise on quality is acceptable.

        -

        From Fig. 7.8 we have evidence that output prices are higher than input prices. This reflects the greater computational resources typically required at inference time for output compared to processing input text (e.g. tokenization, encoding). We also observe a quite significant variation in pricing across different models. Prices range from a few cents per 1M tokens (e.g., Gemini 2.0 Flash, Nova Micro, Nova Lite) to several dollars per 1M tokens (e.g., Claude 3.5 Sonnet, GPT-4o). Mistral large 2 is the most expensive model at \(2/\)6 per 1M input/output tokens while Nova Micro family is the cheapest among Open Source options.

        +

        From Fig. 8.8 we have evidence that output prices are higher than input prices. This reflects the greater computational resources typically required at inference time for output compared to processing input text (e.g. tokenization, encoding). We also observe a quite significant variation in pricing across different models. Prices range from a few cents per 1M tokens (e.g., Gemini 2.0 Flash, Nova Micro, Nova Lite) to several dollars per 1M tokens (e.g., Claude 3.5 Sonnet, GPT-4o). Mistral large 2 is the most expensive model at \(2/\)6 per 1M input/output tokens while Nova Micro family is the cheapest among Open Source options.

        Input and Output Prices
        -

        Fig. 7.8 Input and Output Prices Comparison.

        +

        Fig. 8.8 Input and Output Prices Comparison.

        -

        Latency figures in Fig. 7.9 put GPT-4o (Nov ‘24) as the best performing model but Llama, Nova Micro, Phi and Mistral model families all have options with latency of half a second or better beating Gemini and Claude models considered as well as GPT-4o mini.

        +

        Latency figures in Fig. 8.9 put GPT-4o (Nov ‘24) as the best performing model but Llama, Nova Micro, Phi and Mistral model families all have options with latency of half a second or better beating Gemini and Claude models considered as well as GPT-4o mini.

        Latency Comparison
        -

        Fig. 7.9 Latency Comparison.

        +

        Fig. 8.9 Latency Comparison.

        This analysis provides a framework for evaluating key performance considerations when selecting an LLM. While the specific figures for cost, latency, and quality change frequently (often daily) as providers update their offerings and pricing, the fundamental tradeoffs remain relevant. When evaluating model suitability for a specific use case, practitioners should carefully consider:

        @@ -493,9 +502,9 @@

        -

        7.2.3. Licensing

        +

        8.2.3. Licensing

        When evaluating open-source LLMs, it’s important to consider licensing and data usage policies. Some models may require attribution or commercial use licenses, while others may be more permissive. Additionally, ensure that the model’s training data is compatible with your intended use case and complies with relevant data protection laws.

        -

        The licensing landscape for LLMs spans from highly permissive to custom and restricted usage. Table 7.2 provides a summary of the licensing terms for some of the most popular open source LLMs. We observe two types of licenses:

        +

        The licensing landscape for LLMs spans from highly permissive to custom and restricted usage. Table 8.2 provides a summary of the licensing terms for some of the most popular open source LLMs. We observe two types of licenses:

        • Traditional Open Source:

            @@ -512,7 +521,7 @@

            -

        + @@ -556,19 +565,19 @@

        [Penedo et al., 2024], FineWeb is made of a 15-trillion token dataset derived from 96 Common Crawl snapshots that produces better-performing LLMs than other open pretraining datasets. Additionally, data curation codebase and all of the models trained during our ablation experiments are made available. FineWeb is a fine example of an initiative that helps minimize the gap between proprietary and public knowledge.

        -

        7.2.4. Community Support

        +

        8.2.4. Community Support

        Community support plays a vital role in the open-source LLM ecosystem. Active communities contribute to model development, provide technical assistance, and share valuable resources. When evaluating open-source LLMs, the strength and engagement of the community should be a key consideration, as it directly impacts the model’s long-term viability and practical utility.

        The popularity of different model families reflects their community adoption. In 2024, the Qwen and Llama families have emerged as clear favorites, with Qwen2.5-1.5B-Instruct alone representing 35% of total open source models downloads in 2024.

        Hugging Face Downloads
        -

        Fig. 7.10 Hugging Face Model Downloads in 2024 as of December 22 of the same year [Face, 2024t].

        +

        Fig. 8.10 Hugging Face Model Downloads in 2024 as of December 22 of the same year [Face, 2024t].

        Strong communities accelerate model innovation through collective effort. When developers and researchers collaborate on model development, they create a powerful ecosystem of continuous improvement. Through transparent sharing of findings, they enable rapid development of novel applications and specialized model variants for specific domains. This collaborative environment naturally leads to the establishment of best practices and frameworks that benefit the entire community. The success of this community-driven approach is evident in models like Qwen2.5-1.5B-Instruct, which has spawned 200+ derivative models through post-training adaptations [Qwen, 2024b].

        -

        7.2.5. Customization

        +

        8.2.5. Customization

        Model customization is an important consideration when selecting an open-source LLM. Adapting and fine-tuning to specific use cases can significantly impact practical utility and performance in production environments.

        Model providers increasingly offer streamlined fine-tuning services. For example, Mistral demonstrates an accessible approach to model customization. The code below shows Mistral’s straightforward fine-tuning API. The example shows how to create and start a fine-tuning job with just a few lines of code. The fine-tuning job is configured with the base model “open-mistral-7b” and uses training and validation files from the Ultrachat dataset [Face, 2024u]. This API design makes it easy to experiment with model customization while maintaining control over the training process.

        @@ -590,7 +599,7 @@

        created_jobs -

        For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports [Face, 2024d]:

        +

        For more comprehensive customization needs, Hugging Face’s Transformer Reinforcement Learning (TRL) toolkit provides robust capabilities for model adaptation. Built on the Transformers library, TRL supports [Face, 2024d]:

        • Supervised Fine-Tuning (SFT)

        • Reward Modeling (RM)

        • @@ -598,9 +607,9 @@

          Case Study: Aligning a Language Model to a Policy, we will explore how to use TRL to fine-tune a model to align with user preferences.

          -

          Successful model customization demands managing critical resources throughout the development lifecycle. This includes rigorous dataset preparation and validation to ensure high-quality training data, careful configuration of training infrastructure to optimize computational resources, systematic experimentation iterations while managing associated costs, comprehensive performance evaluation frameworks to measure improvements, and thoughtful deployment architecture planning to ensure smooth production integration. Of course, actual cost of storage and inference should be taken into consideration. Table 7.3 shows as an example the cost of associated with fine-tuning Mistral models [AI, 2024a].

          +

          Successful model customization demands managing critical resources throughout the development lifecycle. This includes rigorous dataset preparation and validation to ensure high-quality training data, careful configuration of training infrastructure to optimize computational resources, systematic experimentation iterations while managing associated costs, comprehensive performance evaluation frameworks to measure improvements, and thoughtful deployment architecture planning to ensure smooth production integration. Of course, actual cost of storage and inference should be taken into consideration. Table 8.3 shows as an example the cost of associated with fine-tuning Mistral models [AI, 2024a].

        Table 7.1 Benchmark results for Llama 2 family of models.Table 8.1 Benchmark results for Llama 2 family of models.

        Model

        Size

        Table 7.2 Open Source LLMs.Table 8.2 Open Source LLMs.

        Creator

        LLM

        - + @@ -666,10 +675,10 @@

        -

        7.3. Tools for Local LLM Deployment

        +

        8.3. Tools for Local LLM Deployment

        Local LLM deployment tools generally fall into two categories: inference-focused tools that prioritize performance and programmability for technical users requiring production-grade deployments, and user interface (UI) tools that emphasize accessibility through graphical interfaces for non-technical users, trading some performance for ease of use and broader adoption. In the following sections we will explore some of these tools discussing their features, capabilities, and trade-offs.

        -

        7.3.1. Serving Models

        +

        8.3.1. Serving Models

        Serving an LLM model involves making it available for inference by setting up infrastructure to process requests and manage resources efficiently. This serving layer handles several key responsibilities, from loading model weights and managing compute resources to processing requests and optimizing performance. Let’s examine the core components of model serving:

        1. Model Loading and Initialization

        2. @@ -707,11 +716,11 @@

          Fig. 7.11. Getting this layer right is crucial for building locally-served reliable AI-powered applications, as it directly impacts the end-user experience in terms of response times, reliability, and resource efficiency.

          +

          The serving layer acts as the bridge between the LLM and applications while working on top of a hardware stack as shown in Fig. 8.11. Getting this layer right is crucial for building locally-served reliable AI-powered applications, as it directly impacts the end-user experience in terms of response times, reliability, and resource efficiency.

          Local Inference Server
          -

          Fig. 7.11 Local Inference Server.

          +

          Fig. 8.11 Local Inference Server.

          Model inference can be performed on Open Source models using cloud solutions such as Groq, Cerebras Systems, and SambaNova Systems. Here, we limit our scope to Open Source solutions that enable inference on local machines which includes consumer hardware. We will cover the following:

          @@ -722,7 +731,7 @@

          -

          7.3.1.1. LLama.cpp

          +

          8.3.1.1. LLama.cpp

          LLama.cpp [Gerganov and contributors, 2024a] is an MIT-licensed open source optimized implementation of the LLama model architecture designed to run efficiently on machines with limited memory.

          Originally developed by Georgi Gerganov and today counting with hundreds of contributors, this C/C++ LLama version provides a simplified interface and advanced features that allow language models to run locally without overwhelming systems. With the ability to run in resource-constrained environments, LLama.cpp makes powerful language models more accessible and practical for a variety of applications.

          In its “Manifesto” [Gerganov and others, 2023], the author highlights the significant potential in bringing AI from cloud to edge devices, emphasizing the importance of keeping development lightweight, experimental, and enjoyable rather than getting bogged down in complex engineering challenges. The author states a vision that emphasizes maintaining an exploratory, hacker-minded approach while building practical edge computing solutions highlighting the following core principles:

          @@ -917,7 +926,7 @@

          -

          7.3.1.2. Llamafile

          +

          8.3.1.2. Llamafile

          Developed by Occupy Wall Street’s former activist, Justine Tunney, Llamafile [Mozilla Ocho, 2024] is an Appache 2.0 licensed open source tool that combines the power of LLama.cpp with Cosmopolitan Libc, a universal C standard library that allows creating portable executables compatible with multiple operating systems.

          In this way, Llamafile reduces all the complexity of LLMs to a single executable file (called a “llamafile”) that runs locally without installation. Key advantages of Llamafile over plain Llama.cpp include:

            @@ -962,7 +971,7 @@

            http://localhost:8080. And we can use it as demonstrated in the previous section.

        -

        7.3.1.3. Ollama

        +

        8.3.1.3. Ollama

        Ollama is a lightweight, MIT-licensed open-source tool for running LLMs locally. It provides a simple interface for interacting with a wide range of language models, including popular models like Llama 3.1 and Llama 3.2. Ollama is designed to be easy to install and use, making it a popular choice for developers who want to run LLMs locally without the need for extensive setup or configuration. Ollama’s key advantages include:

        1. Model Management

        2. @@ -1056,10 +1065,10 @@

          -

          7.3.1.4. Comparison

          -

          Each solution offers distinct advantages and tradeoffs that make them suitable for different use cases. At a high-level, Ollama is the easiest to install and use and has become the most popular choice for your average use case, Llamafile is the easiest to distribute and a good choice when portability is a priority, and Llama.cpp is the most customizable and performant solution as summarized in Table 7.4.

          +

          8.3.1.4. Comparison

          +

          Each solution offers distinct advantages and tradeoffs that make them suitable for different use cases. At a high-level, Ollama is the easiest to install and use and has become the most popular choice for your average use case, Llamafile is the easiest to distribute and a good choice when portability is a priority, and Llama.cpp is the most customizable and performant solution as summarized in Table 8.4.

        Table 7.3 Mistral fine-tuning costs as of December 22, 2024.Table 8.3 Mistral fine-tuning costs as of December 22, 2024.

        Model

        One-off training (/M tokens)

        - + @@ -1112,10 +1121,10 @@

        -

        7.3.2. UI

        +

        8.3.2. UI

        There is a growing number of UI tools for local LLM deployment that aim at providing a more user-friendly experience. Ranging from closed-source to open-source solutions across a range of features and capabilities. We will discuss LM Studio, Jan, and OpenWebUI.

        -

        7.3.2.1. LM Studio

        +

        8.3.2.1. LM Studio

        LM Studio [LM Studio, 2024] is a closed-source GUI for running LLMs locally. In the context of local deployment, LM Studio positions itself as a more user-friendly, feature-rich solution compared to the other tools. It’s particularly valuable for developers transitioning from cloud APIs to local deployment, and for users who prefer graphical interfaces over command-line tools. Key Features of LM Studio include:

        • Model Parameter Customization: Allows adjusting temperature, maximum tokens, frequency penalty, and other settings

        • @@ -1123,23 +1132,23 @@

          Fig. 7.12 and Fig. 7.13 show LM Studio’s chat interface and server, respectively.

          +

          Fig. 8.12 and Fig. 8.13 show LM Studio’s chat interface and server, respectively.

          LM Studio
          -

          Fig. 7.12 LM Studio Chat Interface.

          +

          Fig. 8.12 LM Studio Chat Interface.

          LM Studio Server
          -

          Fig. 7.13 LM Studio Server.

          +

          Fig. 8.13 LM Studio Server.

          One important feature of LM Studio is that it provides machine specification verification capabilities, checking computer specifications like GPU and memory to report compatible models therefore helping users choose the right model. It also includes a local inference server for developers that allows setting up a local HTTP server similar to OpenAI’s API. Importantly, LM Studio’s OpenAI API compatibility is a particularly strong feature for developers looking to move their applications from cloud to local deployment with minimal code changes.

        -

        7.3.2.2. Jan

        +

        8.3.2.2. Jan

        Jan is an open source ChatGPT-alternative that runs local models. Its model’s library contains popular LLMs like Llama, Gemma, Mistral, or Qwen. Key Features of Jan include:

        1. User-Friendly Interface: Run AI models with just a few clicks

        2. @@ -1148,16 +1157,16 @@

          Fig. 7.14 shows Jan’s chat interface.

          +

          Jan has a default C++ inference server built on top of llama.cpp and provides an OpenAI-compatible API. Jan natively supports GGUF (through a llama.cpp engine) and TensorRT (through a TRT-LLM engine). HuggingFace models can be downloaded directly using the model’s ID or URL. User can optionally use cloud-based models (e.g. GPT, Claude models). Fig. 8.14 shows Jan’s chat interface.

          Jan
          -

          Fig. 7.14 Jan Chat Interface.

          +

          Fig. 8.14 Jan Chat Interface.

        -

        7.3.2.3. Open WebUI

        +

        8.3.2.3. Open WebUI

        Open WebUI is an open-source web interface designed to enhance the local AI model experience, particularly for Ollama and OpenAI-compatible APIs. It aims to provide enterprise-grade features while maintaining user-friendliness. OpenWebUI’s core features include:

        1. Advanced User Interface

          @@ -1187,20 +1196,20 @@

          Fig. 7.15 shows Open WebUI’s chat interface.

          +

          Fig. 8.15 shows Open WebUI’s chat interface.

          Open WebUI
          -

          Fig. 7.15 Open WebUI Chat Interface.

          +

          Fig. 8.15 Open WebUI Chat Interface.

          While Open WebUI offers advanced capabilities including RAG and multi-model support, these features require more system resources than simpler alternatives. Open WebUI is likely to be adopted by enterprise users who require advanced features and a more user-friendly interface.

        -

        7.3.2.4. Comparison

        -

        LM Studio excels at providing individual developers with a smooth transition from cloud APIs to local deployment, offering an intuitive interface and robust API compatibility, however it is closed-source. Jan focuses on simplicity and accessibility, making it ideal for personal use and basic deployments while maintaining open-source benefits. OpenWebUI makes additional features available to enterprise users and teams requiring advanced features like RAG, collaboration tools, and granular access controls, though this may come at the cost of increased complexity and resource requirements. We compare the three tools in Table 7.5.

        +

        8.3.2.4. Comparison

        +

        LM Studio excels at providing individual developers with a smooth transition from cloud APIs to local deployment, offering an intuitive interface and robust API compatibility, however it is closed-source. Jan focuses on simplicity and accessibility, making it ideal for personal use and basic deployments while maintaining open-source benefits. OpenWebUI makes additional features available to enterprise users and teams requiring advanced features like RAG, collaboration tools, and granular access controls, though this may come at the cost of increased complexity and resource requirements. We compare the three tools in Table 8.5.

        Table 7.4 lama.cpp vs Ollama vs Llamafile ComparisonTable 8.4 lama.cpp vs Ollama vs Llamafile Comparison

        Feature

        Ollama

        - + @@ -1265,7 +1274,7 @@

        -

        7.4. Case Study: The Effect of Quantization on LLM Performance

        +

        8.4. Case Study: The Effect of Quantization on LLM Performance

        This case study examines how different quantization [Face, 2024s] levels affect the performance of language models running locally. Quantization is a crucial technique for reducing model size and memory footprint while enhancing inference speed, but it comes with potential tradeoffs in model quality. Understanding these tradeoffs is essential for practitioners deploying LLMs in resource-constrained environments.

        Using the Qwen 2.5 0.5B model as our baseline, we’ll compare four variants:

          @@ -1292,7 +1301,7 @@

          -

          7.4.1. Prompts Dataset

          +

          8.4.1. Prompts Dataset

          To evaluate the impact of quantization on model performance, we first need a set of prompts that will serve as input data for our experiments. We’ll construct a dataset from WikiText-2 [Salesforce, 2024], which contains Wikipedia excerpts.

          In our experiments, we will use a total of NUM_PROMPTS prompts that vary in length from MIN_PROMPT_LENGTH to MAX_PROMPT_LENGTH tokens. Using a fixed set of prompts ensures consistent evaluation across model variants and enables direct comparison of metrics like perplexity and throughput.

          @@ -1356,19 +1365,19 @@

          -

          7.4.2. Quantization

          +

          8.4.2. Quantization

          We can quantize a model using the llama-quantize CLI. For instance, to quantize the Qwen 2.5 0.5B model to Q4_K, we can run the following command:

          ./llama-quantize -m ./models/qwen2.5-0.5b-instruct-fp16.gguf ./models/qwen2.5-0.5b-instruct-q8_0.gguf Q4_K
           
          -

          Table 7.6 describes the key quantization levels used in this study [Hugging Face, 2024w], where:

          +

          Table 8.6 describes the key quantization levels used in this study [Hugging Face, 2024w], where:

          • q is the quantized value

          • block_scale is the scaling factor for the block (with bit width in parentheses)

          • block_min is the block minimum value (with bit width in parentheses)

        Table 7.5 LM Studio vs Jan vs OpenWebUI ComparisonTable 8.5 LM Studio vs Jan vs OpenWebUI Comparison

        Feature Category

        LM Studio

        - + @@ -1397,7 +1406,7 @@

        -

        7.4.3. Benchmarking

        +

        8.4.3. Benchmarking

        We will measure quantized model “quality” by means of perplexity and KL Divergence.

        Perplexity

        Perplexity is a common metric for evaluating language models that measures how well a model predicts a sample of text. Lower perplexity indicates better prediction (less “perplexed” by the text).

        @@ -1438,24 +1447,24 @@

        -

        7.4.4. Results

        -

        The KL divergence and perplexity results in Fig. 7.17 and Fig. 7.16 provide insights into model quality across different quantization levels. Q6 maintains near-perfect correlation (99.90%) with the base model and minimal KL divergence (0.004), indicating very close distribution matching. Q2’s higher KL divergence (0.112) and lower correlation (98.31%) quantify its increased deviation from the base model’s behavior.

        +

        8.4.4. Results

        +

        The KL divergence and perplexity results in Fig. 8.17 and Fig. 8.16 provide insights into model quality across different quantization levels. Q6 maintains near-perfect correlation (99.90%) with the base model and minimal KL divergence (0.004), indicating very close distribution matching. Q2’s higher KL divergence (0.112) and lower correlation (98.31%) quantify its increased deviation from the base model’s behavior.

        Perplexity
        -

        Fig. 7.16 KL Divergence results for Quantization Q2, Q4, and Q6 quantized models.

        +

        Fig. 8.16 KL Divergence results for Quantization Q2, Q4, and Q6 quantized models.

        Perplexity
        -

        Fig. 7.17 Perplexity results for Quantization Q2, Q4, and Q6 quantized models.

        +

        Fig. 8.17 Perplexity results for Quantization Q2, Q4, and Q6 quantized models.

        -

        From Table 7.7, we observe that the Q2 model achieves the smallest size at 390 MiB +

        From Table 8.7, we observe that the Q2 model achieves the smallest size at 390 MiB (67% reduction from base) with prompt throughput of 81 tokens/s, but has the highest perplexity degradation at 10.36%. The Q4 model offers a better balance, with good size savings (60% reduction) and only 3.5% perplexity loss. Q6 comes closest to matching the base model’s performance with just 0.93% perplexity degradation, while still providing 47% size reduction.

        Table 7.6 Quantization LevelsTable 8.6 Quantization Levels

        Quantization

        Description

        - + @@ -1507,16 +1516,16 @@

        -m: Specify model paths for base FP16 model and Q2, Q4, Q6 quantized versions

        This runs text generation on a default benchmark of 128 tokens generation length (configurable via -g parameter).

        -

        Results in Fig. 7.18 indicate the base model delivers text generation performance at 19.73 tokens/s, while the most aggressively quantized Q2 model (390.28 MiB) delivers the highest throughput at 42.62 tokens/s, representing a 2.16x speedup. This pattern continues across Q4 (462.96 MiB, 38.38 tokens/s) and Q6 (614.58 MiB, 35.43 tokens/s), which presents a 1.85x and 1.79x speedup, respectively.

        +

        Results in Fig. 8.18 indicate the base model delivers text generation performance at 19.73 tokens/s, while the most aggressively quantized Q2 model (390.28 MiB) delivers the highest throughput at 42.62 tokens/s, representing a 2.16x speedup. This pattern continues across Q4 (462.96 MiB, 38.38 tokens/s) and Q6 (614.58 MiB, 35.43 tokens/s), which presents a 1.85x and 1.79x speedup, respectively.

        Text Generation Performance
        -

        Fig. 7.18 Text Generation Performance results for Quantization Q2, Q4, Q6 and base models.

        +

        Fig. 8.18 Text Generation Performance results for Quantization Q2, Q4, Q6 and base models.

        -

        Benchmarking was performed on Ubuntu 24.04 LTS for x86_64-linux-gnu on commodity hardware (Table 7.8) with no dedicated GPU demonstrating the feasibility of running LLMs locally by nearly everyone with a personal computer thanks to LLama.cpp.

        +

        Benchmarking was performed on Ubuntu 24.04 LTS for x86_64-linux-gnu on commodity hardware (Table 8.8) with no dedicated GPU demonstrating the feasibility of running LLMs locally by nearly everyone with a personal computer thanks to LLama.cpp.

        Table 7.7 Quantization BenchmarksTable 8.7 Quantization Benchmarks

        Model

        Size (MiB)

        - + @@ -1536,14 +1545,14 @@

        -

        7.4.5. Takeaways

        +

        8.4.5. Takeaways

        The quantization analysis of the Qwen 2.5 0.5B model demonstrates a clear trade-off among model size, inference speed, and prediction quality. While the base model (1170 MiB) maintains the highest accuracy it operates at the lowest text generation and prompt throughput of 19.73 tokens/s and 94.39 tokens/s, respectively. In contrast, the Q2_K quantization achieves significant size reduction (67%) and the highest throughput (42.62 tokens/s), but exhibits the largest quality degradation with a 10.36% perplexity increase and lowest KL divergence among quantized models. Q4_K emerges as a compelling middle ground, offering substantial size reduction (60%) and strong text generation and prompt throughput performance (38.38 tokens/s and 77.08 tokens/s, respectively), while maintaining good model quality with only 3.5% perplexity degradation and middle-ground KL divergence level.

        These results, achieved on commodity CPU hardware, demonstrate that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments.

        It is important to note that these results are not meant to be exhaustive and are only meant to provide a general idea of the trade-offs involved in quantization. Targeted benchmarks should be performed for specific use cases and models to best reflect real-world performance.

        -

        7.5. Conclusion

        +

        8.5. Conclusion

        Running open source language models locally represents a compelling proposition in how we interact with AI technology. The transition from cloud-based to local deployment offers important advantages in terms of privacy, cost control, and customization flexibility, while introducing important technical considerations around resource management and performance optimization. The growing ecosystem of tools and frameworks, from low-level libraries like llama.cpp to user-friendly interfaces like LM Studio and Jan, has made local deployment increasingly accessible to both individual developers and organizations.

        Our case study demonstrated that quantization can significantly improve inference speed and reduce model size while maintaining acceptable quality thresholds, making large language models more accessible for resource-constrained environments. As demonstrated in our case study with the Qwen 2.5 0.5B model, practitioners can achieve significant reductions in model size and improvements in inference speed while maintaining acceptable performance levels. The Q4_K quantization scheme emerged as a particularly effective compromise, offering substantial size reduction (60%) and strong throughput while limiting quality degradation to just 3.5% in perplexity measures.

        Looking ahead, the continued development of open source models and deployment tools suggests a future where local AI deployment becomes increasingly viable and sophisticated. The success of open source models like Qwen and Llama, combined with improvements in local model serving and techniques couple with efficient small language models (SLMs), indicate that local deployment will likely play an increasingly important role in the AI landscape. However, practitioners must carefully evaluate their specific requirements across dimensions like task suitability, resource constraints, and performance needs when choosing between local and cloud-based deployment strategies.

        @@ -1560,12 +1569,8 @@

        -

        7.6. References

        +

        8.6. References

        -
        -[AI24] -

        DeepSeek AI. Deepseek-v3 technical report. Technical Report, 2024. URL: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf.

        -
        [AI4c]

        Meta AI. The llama 3 herd of models. 2024c. URL: https://arxiv.org/abs/2407.21783, arXiv:2407.21783.

        @@ -1599,7 +1604,11 @@

        (1,2)

        Andrei Betlen and contributors. Llama-cpp-python. GitHub Repository, 2024. Python bindings for llama.cpp library enabling high-performance inference of LLaMA models. URL: https://github.com/abetlen/llama-cpp-python.

        -
        +
        +[Dee24] +

        DeepSeek. Deepseek-v3 technical report. Technical Report, 2024. URL: https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf.

        +
        +
        [Fac4d]

        Hugging Face. Trl. 2024d. TRL. URL: https://huggingface.co/docs/trl/en/index.

        @@ -1662,7 +1671,7 @@

        [Rev24]

        Harvard Law Review. Nyt v. openai: the times's about-face. https://harvardlawreview.org/blog/2024/04/nyt-v-openai-the-timess-about-face/, 2024. Accessed: 2024.

        -
        +
        [TMS+23]

        Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez, Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushkar Mishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, and Thomas Scialom. Llama 2: open foundation and fine-tuned chat models. 2023. URL: https://arxiv.org/abs/2307.09288, arXiv:2307.09288.

        @@ -1732,11 +1741,11 @@

        Table 7.8 Benchmarking HardwareTable 8.8 Benchmarking Hardware

        Device

        Description

        - + @@ -1213,10 +1222,10 @@

        -

        5.6.2.1. Rules-Based Safety Filtering

        -

        Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard [ProtectAI, 2024], AWS Comprehend [Amazon Web Services, 2024], and NeMo Guardrails [NVIDIA, 2024] as detailed in Table 5.2.

        +

        6.6.2.1. Rules-Based Safety Filtering

        +

        Examples of tools that can be used as rules-based safety filters are Webpurify, LLM-Guard [ProtectAI, 2024], AWS Comprehend [Amazon Web Services, 2024], and NeMo Guardrails [NVIDIA, 2024] as detailed in Table 6.2.

        Table 5.1 Representative Safety Layer Risk Map.Table 6.1 Representative Safety Layer Risk Map.

        Risk

        Prompt

        - + @@ -1275,13 +1284,13 @@

        -

        5.6.2.2. LLM-Based Safety Filtering

        +

        6.6.2.2. LLM-Based Safety Filtering

        Alternatively, an LLM-based component can be used as a content filter. Here, we observe three types os approaches: 1. Moderation API, 2. Fine-Tuned Open Source Models, and 3. Custom Moderation.

        Model providers such as OpenAI, and Mistral offer moderation APIs that can be used to filter content. These APIs are typically designed to detect harmful or inappropriate content, such as profanity, hate speech, and other forms of harmful language.

        -

        Mistral’s Moderation API [Mistral AI, 2024], released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.

        +

        Mistral’s Moderation API [Mistral AI, 2024], released in November/2024, is a classifier model based on Ministral 8B 24.10. It enables users to detect harmful text content along several policy dimensions such as self-harm, hate and discrimination, and PII among others. It can be used to classify both raw text or conversational content. We will cover this API in more detail in the Case Study.

        # Mistral's Moderation API - Raw Text
         import os
         from mistralai import Mistral
        @@ -1317,7 +1326,7 @@ 

        print(response)

        -

        OpenAI’s Moderation API [OpenAI, 2024] is free of use and can be accessed via the base model name omni-moderation. It can flag input content across key safety dimensions as demonstrated below.

        +

        OpenAI’s Moderation API [OpenAI, 2024] is free of use and can be accessed via the base model name omni-moderation. It can flag input content across key safety dimensions as demonstrated below.

        from dotenv import load_dotenv
        @@ -1466,29 +1475,29 @@ 

        [IBM, 2024]. The collection comprises two classes of models:

        +

        IBM Granite Guardian is a new competitor to Llama Guard family. It is collection of models designed to help govern key risk dimensions as defined by IBM’s AI Risk Atlas [IBM, 2024]. The collection comprises two classes of models:

        1. Granite-Guardian-3.0-2B and Granite-Guardian-3.0-8B for detecting different forms of harmful content

        2. Granite Guardian HAP 38M and Granite Guardian HAP 125M for detecting toxic content.

        -

        In a paper from December/2024 [Padhi et al., 2024], the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In Fig. 5.14 we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.

        +

        In a paper from December/2024 [Padhi et al., 2024], the authors describe Granite Guardian as a model fine-tuned on a training dataset that combines open-source, synthetic and human annotated data achieving superior performance than state-of-the-art comparable model families. In Fig. 6.14 we observe that IBM Granite Guardian performance is overall superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension.

        IBM Granite Guardian performance for the "Harm" risk dimension.
        -

        Fig. 5.14 IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension [Padhi et al., 2024].

        +

        Fig. 6.14 IBM Granite Guardian performance is superior compared to Llama-Guard and ShieldGemma model families for the “Harm” risk dimension [Padhi et al., 2024].

        The industry is increasingly focusing on the fine-tuning of pre-trained base models targeting a specific dimension of requirements and standards, here Safety being a critical one. This trend encompasses the release of open-source, fine-tuned safety models that can act as protective guardrails for LLM applications, as exemplified by LLaMa-Guard and IBM Granite Guardian. Additionally, there is a notable rise in models fine-tuned through techniques such as Reinforcement Learning from Human Feedback (RLHF), utilizing human preference datasets that incorporate safety considerations. These specialized models can function as safety filters as discussed but also as main models that alone could accomplished their original intended task but safely. We will cover this specific topic of preference-based alignment in the next Chapter Preference-Based Alignment, where we will explore the process of aligning language models with human preferences ultimately leading to the development of an open source fine-tuned model that complies with user provided policy-based requirements.

        -

        5.6.2.3. Custom Moderation

        +

        6.6.2.3. Custom Moderation

        We have covered filtering-based approaches using moderation APIs and fine-tuned open source models. Rather than relying on external filters, LLMs themselves can be guided to avoid harmful content through careful prompt engineering.

        Custom moderation offers a tailored content filtering approach, ensuring adherence to your own specific standards. As we have seen, each filtering-based approach we have discussed, while each having its own strengths, they all implement or enable safety according to a pre-defined dimension of requirements and standards. Custom moderation, on the other hand, provides greater control compared to general moderation APIs or fine-tuned open source models though it requires more setup and maintenance.

        -

        A common approach, when building a custom LLM-based filter, is to build an LLM-as-a-Judge filter as illustrated in Fig. 5.15. It a simple idea to use an LLM to judge the output of another LLM as well as user prompt in the context of your LLM-based application (please see Section “Model Based Evaluation” - Chapter Evals for design and best practices of LLM-based evals.)

        +

        A common approach, when building a custom LLM-based filter, is to build an LLM-as-a-Judge filter as illustrated in Fig. 6.15. It a simple idea to use an LLM to judge the output of another LLM as well as user prompt in the context of your LLM-based application (please see Section “Model Based Evaluation” - Chapter Evals for design and best practices of LLM-based evals.)

        LLM-as-a-judge as safety filter.
        -

        Fig. 5.15 LLM-as-a-judge as safety filter.

        +

        Fig. 6.15 LLM-as-a-judge as safety filter.

        Below we display an example of a prompt engineered for an LLM-as-a-judge to be used as a safety filter.

        @@ -1558,17 +1567,17 @@

        -

        5.7. Case Study: Implementing a Safety Filter

        +

        6.7. Case Study: Implementing a Safety Filter

        We will implement a basic safety filter for a K-12 application that will be used to filter content in a chat interface. The application will be designed to be used in a classroom setting where students and teachers can interact with the model to ask questions and receive answers. The safety filter will be designed to filter out harmful content such as profanity, hate speech, and other inappropriate content.

        In this stylized case study, we will limit our scope to the implementation of a safety filter for user prompts. We will not cover the implementation of the application itself or filtering the model’s output but rather focus on the user prompt safety filter. In real-world applications, an input policy would be paramount to better define what safety means before we identify associated risks and consecutive implementation decisions. Here, we will discuss the implementation of safety through the design of the evals dataset (you will later see, skipping policy will lead to trouble later in the case study!)

        -

        5.7.1. Evals Dataset

        +

        6.7.1. Evals Dataset

        Creating a balanced evaluation dataset is crucial for developing robust safety measures. The dataset should be a well balanced set of “good” and “bad” samples to avoid biasing the model’s behavior in either direction.

        For this evaluation, we will create a dataset with NUM_SAMPLES examples, evenly split between good and bad samples (GOOD_SAMPLES and BAD_SAMPLES, respectively).

        The good samples will be sourced from the UltraFeedback Binarized dataset [H4, 2024z], which contains high-quality, appropriate prompts that represent normal user interactions, often utilized to fine-tune models for instruction-following, truthfulness, honesty and helpfulness in a preference-based alignment process.

        The bad samples will come from two sources:

          -
        1. Profanity keywords from the Surge AI Profanity Dataset [Surge AI, 2024] - This provides examples of explicit inappropriate content.

        2. +
        3. Profanity keywords from the Surge AI Profanity Dataset [Surge AI, 2024] - This provides examples of explicit inappropriate content.

        4. Prompts sourced from Salad-Bench - These represent more subtle forms of harmful content like scams, harassment, or dangerous instructions, hence not necessarily mentioning an inappropriate keywords but rather a potentially harmful instruction.

        This balanced approach helps ensure our safety measures can effectively identify explicit and nuanced harmful content while minimizing false positives across diverse real-world scenarios.

        @@ -1581,7 +1590,7 @@

        -

        5.7.1.1. Bad Samples

        +

        6.7.1.1. Bad Samples

        def get_profanity_samples(num_samples, show_stats=True):
        @@ -1723,7 +1732,7 @@ 

        -

        5.7.1.2. Good Samples

        +

        6.7.1.2. Good Samples

        def get_good_samples(num_samples):
        @@ -1904,7 +1913,7 @@ 

        -

        5.7.2. Safety Filters

        +

        6.7.2. Safety Filters

        We will implement four safety filters, one for each of the following:

        1. LLM-Guard

        2. @@ -1970,7 +1979,7 @@

          -

          5.7.2.1. LLM-Guard

          +

          6.7.2.1. LLM-Guard

          Next, we implement a concrete validator using LLM Guard. The LLMGuardValidator class combines two key scanners:

          • BanTopics: Flags content containing banned topics

          • @@ -2063,7 +2072,7 @@

            -

            5.7.2.2. Mistral Moderation API

            +

            6.7.2.2. Mistral Moderation API

            You will need a Mistral API key to use the Mistral Moderation API. You can get one by signing up for a Mistral account and creating an API key, which we will assume is stored in a local .env file under the MISTRAL_API_KEY variable.

            The MistralValidator class implements a safety validator using Mistral’s moderation API. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on Mistral moderation categories. Example:

            {'sexual': False,
            @@ -2143,7 +2152,7 @@ 

            -

            5.7.2.3. OpenAI Moderation API

            +

            6.7.2.3. OpenAI Moderation API

            from openai import OpenAI
            @@ -2207,7 +2216,7 @@ 

            -

            5.7.2.4. Custom Judge Validator

            +

            6.7.2.4. Custom Judge Validator

            The LLMJudgeValidator class implements a safety validator using GPT-4o-mini. It takes text input and returns a ValidationResult indicating whether the text is unsafe based on an input safety prompt.

            @@ -2292,7 +2301,7 @@

            -

            5.7.3. Benchmarking

            +

            6.7.3. Benchmarking

            We are ready to run our four safety filters against our dataset. We will store validation results as well as elapsed time for each validator.

            @@ -2781,7 +2790,7 @@

            5.7.4. Takeaways

            +

            6.7.4. Takeaways

            • Safety is a complex problem and there is no one-size-fits-all solution.

            • Starting with a well-aligned policy is key to developing a robust data and evaluation framework.

            • @@ -2791,14 +2800,14 @@

              -

              5.8. Conclusion

              +

              6.8. Conclusion

              The rapid advancement of large language models has created an unsettling paradox: the same technologies that promise to revolutionize human-AI interaction also harbor significant risks that could undermine the very societies they aim to benefit. Our examination of various safety measures - from constitutional AI to red teaming - reveals that each approach has specific strengths and limitations when implemented in practice. However, instead of waiting for governments, organizations, and the public to catch up, we need to take action now.

              The case study on safety filters demonstrated the complexity of implementing even basic safety measures in real-world applications. What appears safe in one context may be inappropriate in another, and our current methods of safety evaluation often struggle with these nuances. The challenge of developing robust safety measures is further complicated by the potential for feedback loops in the training process - when models are fine-tuned on datasets that may contain hidden biases or problematic content.

              The path forward requires combining technical innovation with practical domain-specific wisdom. Safety in GenAI isn’t just a technical problem to be solved - it’s a mirror reflecting our own values, biases, and aspirations back at us. The growing focus on safety across the AI community, from open-source initiatives to corporate governance frameworks, provides a foundation for developing more robust safety measures. However, technologists working in isolation cannot solve these challenges - and may even perpetuate them unknowingly. Instead, domain experts across different verticals must come together to collaboratively define what safety means in the context of their specific users and broader society in work in collaboration with the AI community.

              Only through this cross-disciplinary collaboration can we move beyond the current uncertainty into a future where safety and innovation reinforce rather than oppose each other. This requires building bridges between technical experts, ethicists, policymakers, and the communities they serve to develop holistic frameworks that protect while enabling progress.

        -

        5.9. Citation

        +

        6.9. Citation

        CC BY-NC-SA 4.0

        @misc{tharsistpsouza2024tamingllms,
           author = {Tharsis T. P. Souza},
        @@ -2812,51 +2821,47 @@ 

        -

        5.10. References

        +

        6.10. References

        -
        -[AI24] -

        Meta AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.

        -
        -
        +
        [ASA24] (1,2)

        Jide Alaga, Jonas Schuett, and Markus Anderljung. A grading rubric for ai safety frameworks. 2024. URL: https://arxiv.org/abs/2409.08751, arXiv:2409.08751.

        -
        +
        [ABC+23] (1,2)

        Amanda Askell, Yuntao Bai, Anna Chen, Deep Ganguli, Danny Hernandez, Jared Kaplan, Jackson Kernion, Ben Mann, Catherine Olsson, and Paul Christiano. Constitutional ai: harmlessness from ai feedback. 2023. URL: https://www.anthropic.com/research/constitutional-ai-harmlessness-from-ai-feedback.

        -
        +
        [BHY+24]

        Yoshua Bengio, Geoffrey Hinton, Andrew Yao, Dawn Song, Pieter Abbeel, Trevor Darrell, Yuval Noah Harari, Ya-Qin Zhang, Lan Xue, Shai Shalev-Shwartz, Gillian Hadfield, Jeff Clune, Tegan Maharaj, Frank Hutter, Atılım Güneş Baydin, Sheila McIlraith, Qiqi Gao, Ashwin Acharya, David Krueger, Anca Dragan, Philip Torr, Stuart Russell, Daniel Kahneman, Jan Brauner, and Sören Mindermann. Managing extreme ai risks amid rapid progress. Science, 384(6698):842–845, 2024. URL: https://www.science.org/doi/abs/10.1126/science.adn0117, arXiv:https://www.science.org/doi/pdf/10.1126/science.adn0117, doi:10.1126/science.adn0117.

        -
        +
        [BBC+24] (1,2)

        Victoria Benjamin, Emily Braca, Israel Carter, Hafsa Kanchwala, Nava Khojasteh, Charly Landow, Yi Luo, Caroline Ma, Anna Magarelli, Rachel Mirin, Avery Moyer, Kayla Simpson, Amelia Skawinski, and Thomas Heverin. Systematically analyzing prompt injection vulnerabilities in diverse llm architectures. 2024. URL: https://arxiv.org/abs/2410.23308, arXiv:2410.23308.

        -
        +
        [BMC+24] (1,2)

        Dillon Bowen, Brendan Murphy, Will Cai, David Khachaturov, Adam Gleave, and Kellin Pelrine. Data poisoning in llms: jailbreak-tuning and scaling laws. 2024. URL: https://arxiv.org/abs/2408.02946, arXiv:2408.02946.

        -
        +
        [CMM+24]

        Erik Cambria, Lorenzo Malandri, Fabio Mercorio, Navid Nobani, and Andrea Seveso. Xai meets llms: a survey of the relation between explainable ai and large language models. 2024. URL: https://arxiv.org/abs/2407.15248, arXiv:2407.15248.

        -
        +
        [Edg24] (1,2)

        Alec Edgington. How to exploit large language models for good or bad. SIAM News, 2024. URL: https://www.siam.org/publications/siam-news/articles/how-to-exploit-large-language-models-for-good-or-bad/.

        -
        +
        [Exa24] (1,2)

        Exabeam. Ai regulations and llm regulations: past, present, and future. Exabeam Blog, 2024. URL: https://www.exabeam.com/explainers/ai-cyber-security/ai-regulations-and-llm-regulations-past-present-and-future/.

        -
        +
        [GRB+24]

        Isabel O. Gallegos, Ryan A. Rossi, Joe Barrow, Md Mehrab Tanjim, Sungchul Kim, Franck Dernoncourt, Tong Yu, Ruiyi Zhang, and Nesreen K. Ahmed. Bias and fairness in large language models: a survey. 2024. URL: https://arxiv.org/abs/2309.00770, arXiv:2309.00770.

        @@ -2864,16 +2869,16 @@

        [H44z]

        Hugging Face H4. Ultrafeedback binarized dataset. 2024z. A dataset of binary preference data for training language models. URL: https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized.

        -
        +
        [HGP+22]

        Thomas Hartvigsen, Saadia Gabriel, Hamid Palangi, Maarten Sap, Dipankar Ray, and Ece Kamar. ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 3309–3326. Dublin, Ireland, May 2022. Association for Computational Linguistics. URL: https://aclanthology.org/2022.acl-long.234, doi:10.18653/v1/2022.acl-long.234.

        -
        +
        [HYM+24] (1,2)

        Lei Huang, Weijiang Yu, Weitao Ma, Weihong Zhong, Zhangyin Feng, Haotian Wang, Qianglong Chen, Weihua Peng, Xiaocheng Feng, Bing Qin, and Ting Liu. A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. ACM Transactions on Information Systems, November 2024. URL: http://dx.doi.org/10.1145/3703155, doi:10.1145/3703155.

        -
        +
        [LDW+24] (1,2)

        Lijun Li, Bowen Dong, Ruohui Wang, Xuhao Hu, Wangmeng Zuo, Dahua Lin, Yu Qiao, and Jing Shao. Salad-bench: a hierarchical and comprehensive safety benchmark for large language models. 2024. URL: https://arxiv.org/abs/2402.05044, arXiv:2402.05044.

        @@ -2883,30 +2888,34 @@

        (1,2)

        Stephanie Lin, Jacob Hilton, and Owain Evans. Truthfulqa: measuring how models mimic human falsehoods. 2022. URL: https://arxiv.org/abs/2109.07958, arXiv:2109.07958.

        -
        +
        [MPY+24] (1,2)

        Mantas Mazeika, Long Phan, Xuwang Yin, Andy Zou, Zifan Wang, Norman Mu, Elham Sakhaee, Nathaniel Li, Steven Basart, Bo Li, David Forsyth, and Dan Hendrycks. Harmbench: a standardized evaluation framework for automated red teaming and robust refusal. 2024. URL: https://arxiv.org/abs/2402.04249, arXiv:2402.04249.

        +
        +[MA24] +

        Meta-AI. Llamaguard: llm-based input-output safeguard for human-ai conversations. Meta AI Research Publications, 2024. URL: https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/.

        +
        [MLC24]

        MLCommons. Mlcommons ai illuminate benchmarks. 2024. A collection of standardized benchmarks for evaluating AI systems. URL: https://ailuminate.mlcommons.org/benchmarks/.

        -
        +
        [OAA+24]

        OpenAI, Josh Achiam, Steven Adler, Sandhini Agarwal, Lama Ahmad, Ilge Akkaya, Florencia Leoni Aleman, Diogo Almeida, Janko Altenschmidt, Sam Altman, Shyamal Anadkat, Red Avila, Igor Babuschkin, Suchir Balaji, Valerie Balcom, Paul Baltescu, Haiming Bao, Mohammad Bavarian, Jeff Belgum, Irwan Bello, Jake Berdine, Gabriel Bernadett-Shapiro, Christopher Berner, Lenny Bogdonoff, Oleg Boiko, Madelaine Boyd, Anna-Luisa Brakman, Greg Brockman, Tim Brooks, Miles Brundage, Kevin Button, Trevor Cai, Rosie Campbell, Andrew Cann, Brittany Carey, Chelsea Carlson, Rory Carmichael, Brooke Chan, Che Chang, Fotis Chantzis, Derek Chen, Sully Chen, Ruby Chen, Jason Chen, Mark Chen, Ben Chess, Chester Cho, Casey Chu, Hyung Won Chung, Dave Cummings, Jeremiah Currier, Yunxing Dai, Cory Decareaux, Thomas Degry, Noah Deutsch, Damien Deville, Arka Dhar, David Dohan, Steve Dowling, Sheila Dunning, Adrien Ecoffet, Atty Eleti, Tyna Eloundou, David Farhi, Liam Fedus, Niko Felix, Simón Posada Fishman, Juston Forte, Isabella Fulford, Leo Gao, Elie Georges, Christian Gibson, Vik Goel, Tarun Gogineni, Gabriel Goh, Rapha Gontijo-Lopes, Jonathan Gordon, Morgan Grafstein, Scott Gray, Ryan Greene, Joshua Gross, Shixiang Shane Gu, Yufei Guo, Chris Hallacy, Jesse Han, Jeff Harris, Yuchen He, Mike Heaton, Johannes Heidecke, Chris Hesse, Alan Hickey, Wade Hickey, Peter Hoeschele, Brandon Houghton, Kenny Hsu, Shengli Hu, Xin Hu, Joost Huizinga, Shantanu Jain, Shawn Jain, Joanne Jang, Angela Jiang, Roger Jiang, Haozhun Jin, Denny Jin, Shino Jomoto, Billie Jonn, Heewoo Jun, Tomer Kaftan, Łukasz Kaiser, Ali Kamali, Ingmar Kanitscheider, Nitish Shirish Keskar, Tabarak Khan, Logan Kilpatrick, Jong Wook Kim, Christina Kim, Yongjik Kim, Jan Hendrik Kirchner, Jamie Kiros, Matt Knight, Daniel Kokotajlo, Łukasz Kondraciuk, Andrew Kondrich, Aris Konstantinidis, Kyle Kosic, Gretchen Krueger, Vishal Kuo, Michael Lampe, Ikai Lan, Teddy Lee, Jan Leike, Jade Leung, Daniel Levy, Chak Ming Li, Rachel Lim, Molly Lin, Stephanie Lin, Mateusz Litwin, Theresa Lopez, Ryan Lowe, Patricia Lue, Anna Makanju, Kim Malfacini, Sam Manning, Todor Markov, Yaniv Markovski, Bianca Martin, Katie Mayer, Andrew Mayne, Bob McGrew, Scott Mayer McKinney, Christine McLeavey, Paul McMillan, Jake McNeil, David Medina, Aalok Mehta, Jacob Menick, Luke Metz, Andrey Mishchenko, Pamela Mishkin, Vinnie Monaco, Evan Morikawa, Daniel Mossing, Tong Mu, Mira Murati, Oleg Murk, David Mély, Ashvin Nair, Reiichiro Nakano, Rajeev Nayak, Arvind Neelakantan, Richard Ngo, Hyeonwoo Noh, Long Ouyang, Cullen O'Keefe, Jakub Pachocki, Alex Paino, Joe Palermo, Ashley Pantuliano, Giambattista Parascandolo, Joel Parish, Emy Parparita, Alex Passos, Mikhail Pavlov, Andrew Peng, Adam Perelman, Filipe de Avila Belbute Peres, Michael Petrov, Henrique Ponde de Oliveira Pinto, Michael, Pokorny, Michelle Pokrass, Vitchyr H. Pong, Tolly Powell, Alethea Power, Boris Power, Elizabeth Proehl, Raul Puri, Alec Radford, Jack Rae, Aditya Ramesh, Cameron Raymond, Francis Real, Kendra Rimbach, Carl Ross, Bob Rotsted, Henri Roussez, Nick Ryder, Mario Saltarelli, Ted Sanders, Shibani Santurkar, Girish Sastry, Heather Schmidt, David Schnurr, John Schulman, Daniel Selsam, Kyla Sheppard, Toki Sherbakov, Jessica Shieh, Sarah Shoker, Pranav Shyam, Szymon Sidor, Eric Sigler, Maddie Simens, Jordan Sitkin, Katarina Slama, Ian Sohl, Benjamin Sokolowsky, Yang Song, Natalie Staudacher, Felipe Petroski Such, Natalie Summers, Ilya Sutskever, Jie Tang, Nikolas Tezak, Madeleine B. Thompson, Phil Tillet, Amin Tootoonchian, Elizabeth Tseng, Preston Tuggle, Nick Turley, Jerry Tworek, Juan Felipe Cerón Uribe, Andrea Vallone, Arun Vijayvergiya, Chelsea Voss, Carroll Wainwright, Justin Jay Wang, Alvin Wang, Ben Wang, Jonathan Ward, Jason Wei, CJ Weinmann, Akila Welihinda, Peter Welinder, Jiayi Weng, Lilian Weng, Matt Wiethoff, Dave Willner, Clemens Winter, Samuel Wolrich, Hannah Wong, Lauren Workman, Sherwin Wu, Jeff Wu, Michael Wu, Kai Xiao, Tao Xu, Sarah Yoo, Kevin Yu, Qiming Yuan, Wojciech Zaremba, Rowan Zellers, Chong Zhang, Marvin Zhang, Shengjia Zhao, Tianhao Zheng, Juntang Zhuang, William Zhuk, and Barret Zoph. Gpt-4 technical report. 2024. URL: https://arxiv.org/abs/2303.08774, arXiv:2303.08774.

        -
        +
        [PNC+24] (1,2)

        Inkit Padhi, Manish Nagireddy, Giandomenico Cornacchia, Subhajit Chaudhury, Tejaswini Pedapati, Pierre Dognin, Keerthiram Murugesan, Erik Miehling, Martín Santillán Cooper, Kieran Fraser, Giulio Zizzo, Muhammad Zaid Hameed, Mark Purcell, Michael Desmond, Qian Pan, Inge Vejsbjerg, Elizabeth M. Daly, Michael Hind, Werner Geyer, Ambrish Rawat, Kush R. Varshney, and Prasanna Sattigeri. Granite guardian. 2024. URL: https://arxiv.org/abs/2412.07724, arXiv:2412.07724.

        -
        +
        [PHS+22] (1,2)

        Ethan Perez, Saffron Huang, Francis Song, Trevor Cai, Roman Ring, John Aslanides, Amelia Glaese, Nat McAleese, and Geoffrey Irving. Red teaming language models with language models. 2022. URL: https://arxiv.org/abs/2202.03286, arXiv:2202.03286.

        -
        +
        [SZW+24]

        Oliver J. Sutton, Qinghua Zhou, Wei Wang, Desmond J. Higham, Alexander N. Gorban, Alexander Bastounis, and Ivan Y. Tyukin. Stealth edits to large language models. 2024. URL: https://arxiv.org/abs/2406.12670, arXiv:2406.12670.

        @@ -2915,102 +2924,102 @@

        (1,2)

        Bertie Vidgen, Adarsh Agrawal, Ahmed M. Ahmed, Victor Akinwande, Namir Al-Nuaimi, Najla Alfaraj, Elie Alhajjar, Lora Aroyo, Trupti Bavalatti, Max Bartolo, Borhane Blili-Hamelin, Kurt Bollacker, Rishi Bomassani, Marisa Ferrara Boston, Siméon Campos, Kal Chakra, Canyu Chen, Cody Coleman, Zacharie Delpierre Coudert, Leon Derczynski, Debojyoti Dutta, Ian Eisenberg, James Ezick, Heather Frase, Brian Fuller, Ram Gandikota, Agasthya Gangavarapu, Ananya Gangavarapu, James Gealy, Rajat Ghosh, James Goel, Usman Gohar, Sujata Goswami, Scott A. Hale, Wiebke Hutiri, Joseph Marvin Imperial, Surgan Jandial, Nick Judd, Felix Juefei-Xu, Foutse Khomh, Bhavya Kailkhura, Hannah Rose Kirk, Kevin Klyman, Chris Knotz, Michael Kuchnik, Shachi H. Kumar, Srijan Kumar, Chris Lengerich, Bo Li, Zeyi Liao, Eileen Peters Long, Victor Lu, Sarah Luger, Yifan Mai, Priyanka Mary Mammen, Kelvin Manyeki, Sean McGregor, Virendra Mehta, Shafee Mohammed, Emanuel Moss, Lama Nachman, Dinesh Jinenhally Naganna, Amin Nikanjam, Besmira Nushi, Luis Oala, Iftach Orr, Alicia Parrish, Cigdem Patlak, William Pietri, Forough Poursabzi-Sangdeh, Eleonora Presani, Fabrizio Puletti, Paul Röttger, Saurav Sahay, Tim Santos, Nino Scherrer, Alice Schoenauer Sebag, Patrick Schramowski, Abolfazl Shahbazi, Vin Sharma, Xudong Shen, Vamsi Sistla, Leonard Tang, Davide Testuggine, Vithursan Thangarasa, Elizabeth Anne Watkins, Rebecca Weiss, Chris Welty, Tyler Wilbers, Adina Williams, Carole-Jean Wu, Poonam Yadav, Xianjun Yang, Yi Zeng, Wenhui Zhang, Fedor Zhdanov, Jiacheng Zhu, Percy Liang, Peter Mattson, and Joaquin Vanschoren. Introducing v0.5 of the ai safety benchmark from mlcommons. 2024. URL: https://arxiv.org/abs/2404.12241, arXiv:2404.12241.

        -
        +
        [VSK+24] (1,2)

        Bertie Vidgen, Nino Scherrer, Hannah Rose Kirk, Rebecca Qian, Anand Kannappan, Scott A. Hale, and Paul Röttger. Simplesafetytests: a test suite for identifying critical safety risks in large language models. 2024. URL: https://arxiv.org/abs/2311.08370, arXiv:2311.08370.

        -
        +
        [WMR24]

        Sandra Wachter, Brent Mittelstadt, and Chris Russell. Do large language models have a legal duty to tell the truth? Royal Society Open Science, 11(8):240197, 2024. URL: https://royalsocietypublishing.org/doi/abs/10.1098/rsos.240197, arXiv:https://royalsocietypublishing.org/doi/pdf/10.1098/rsos.240197, doi:10.1098/rsos.240197.

        -
        +
        [YLX24]

        Jiahao Yu, Xingwei Lin, and Xinyu Xing. Gptfuzzer: red teaming large language models with auto-generated safety test cases. Papers with Code, 2024. URL: https://paperswithcode.com/dataset/gptfuzzer.

        -
        +
        [ZYY+24]

        Shuning Zhang, Lyumanshan Ye, Xin Yi, Jingyu Tang, Bo Shui, Haobin Xing, Pengfei Liu, and Hewu Li. "ghost of the past": identifying and resolving privacy leakage from llm's memory through proactive user interaction. 2024. URL: https://arxiv.org/abs/2410.14931, arXiv:2410.14931.

        -
        +
        [Zho24]

        Qinghua Zhou. Stealth edits: detecting stealth edits in llm outputs. Hugging Face Spaces, 2024. URL: https://huggingface.co/spaces/qinghua-zhou/stealth-edits.

        -
        +
        [AmazonWServices24]

        Amazon Web Services. Amazon comprehend - natural language processing service. 2024. AWS natural language processing service for text analysis and content moderation. URL: https://aws.amazon.com/comprehend/.

        -
        +
        [Anthropic24]

        Anthropic. Anthropic's responsible scaling policy. Technical Report, Anthropic, 2024. URL: https://www-cdn.anthropic.com/1adf000c8f675958c2ee23805d91aaade1cd4613/responsible-scaling-policy.pdf.

        -
        +
        [CenterfASafety24a]

        Center for AI Safety. Harmbench. GitHub repository, 2024. Framework for evaluating language model safety. URL: https://github.com/centerforaisafety/HarmBench.

        -
        +
        [CenterfASafety24b]

        Center for AI Safety. Harmbench leaderboard. 2024. Leaderboard tracking performance of language models on safety benchmarks. URL: https://www.harmbench.org/results.

        -
        +
        [DeepMind24] (1,2)

        DeepMind. The frontier safety framework. Technical Report, DeepMind, 2024. URL: https://storage.googleapis.com/deepmind-media/DeepMind.com/Blog/introducing-the-frontier-safety-framework/fsf-technical-report.pdf.

        -
        +
        [EuropeanMAgency24]

        European Medicines Agency. Guiding principles for the use of large language models in regulatory science and medicines regulatory activities. Guidance Document, European Medicines Agency, 2024. URL: https://www.ema.europa.eu/en/documents/other/guiding-principles-use-large-language-models-regulatory-science-medicines-regulatory-activities_en.pdf.

        -
        +
        [FinancialIRAuthority24]

        Financial Industry Regulatory Authority. Artificial intelligence, including large language models and generative ai. Regulatory Notice 24-09, FINRA, 2024. URL: https://www.finra.org/rules-guidance/notices/24-09.

        -
        +
        [IBM24]

        IBM. Ibm watsonx.ai risk atlas. 2024. A framework for identifying and mitigating risks in AI systems. URL: https://www.ibm.com/docs/en/watsonx/saas?topic=ai-risk-atlas.

        -
        +
        [LibraryoCongress23]

        Library of Congress. China: generative ai measures finalized. July 2023. URL: https://www.loc.gov/item/global-legal-monitor/2023-07-18/china-generative-ai-measures-finalized/.

        -
        +
        [MistralAI24]

        Mistral AI. Mistral moderation: a technical report. 2024. URL: https://mistral.ai/news/mistral-moderation/.

        -
        +
        [MLSTeam24]

        ML Safety Team. Safebench: a comprehensive benchmark for llm safety evaluation. ML Safety Website, 2024. URL: https://www.mlsafety.org/safebench.

        -
        +
        [NationalIoSaTechnology24]

        National Institute of Standards and Technology. Ai risk management framework. Technical Report, National Institute of Standards and Technology, 2024. URL: https://www.nist.gov/itl/ai-risk-management-framework.

        -
        +
        [NVIDIA24]

        NVIDIA. Nemo-guardrails: an open-source toolkit for building reliable and safe llm applications. 2024. A framework for creating reliable and safe LLM applications with customizable guardrails. URL: https://github.com/NVIDIA/NeMo-Guardrails.

        -
        +
        [OpenAI24a]

        OpenAI. Openai moderation api. 2024. Documentation for OpenAI's content moderation API. URL: https://platform.openai.com/docs/guides/moderation.

        -
        +
        [OpenAI24b] (1,2)

        OpenAI. Openai preparedness framework. Technical Report, OpenAI, 2024. URL: https://cdn.openai.com/openai-preparedness-framework-beta.pdf.

        -
        +
        [OpenSafetyLab24a]

        OpenSafetyLab. Salad-bench leaderboard. Hugging Face Space, 2024. URL: https://huggingface.co/spaces/OpenSafetyLab/Salad-Bench-Leaderboard.

        -
        +
        [OpenSafetyLab24b]

        OpenSafetyLab. Salad-data: a hierarchical and comprehensive safety dataset for large language models. Hugging Face Dataset, 2024. URL: https://huggingface.co/datasets/OpenSafetyLab/Salad-Data.

        -
        +
        [ProtectAI24]

        ProtectAI. Llm-guard: comprehensive safety and security framework for large language models. 2024. An open-source toolkit for LLM security and safety. URL: https://github.com/protectai/llm-guard.

        -
        +
        [SurgeAI24]

        Surge AI. Surge ai profanity dataset. GitHub repository, 2024. A comprehensive dataset for training and evaluating profanity detection models. URL: https://github.com/surge-ai/profanity.

        @@ -3018,7 +3027,7 @@

        [UKGovernment24]

        UK Government. Ai regulation: a pro-innovation approach. White Paper, Department for Science, Innovation and Technology, 2024. URL: https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper.

        -
        +
        [UNICEF24]

        UNICEF. Policy guidance on ai for children. Policy Report, UNICEF Office of Research - Innocenti, 2024. URL: https://www.unicef.org/innocenti/reports/policy-guidance-ai-children.

        @@ -3051,12 +3060,12 @@

        Table 5.2 Rules-Based Safety Filtering Tools.Table 6.2 Rules-Based Safety Filtering Tools.

        Tool

        Key Features

        @@ -1193,7 +1202,7 @@

        [Guidance AI, 2024] and NVIDIA’s Logits Processor Zoo [NVIDIA, 2024a].

        -

        4.5.3. Research and Ongoing Debate

        +

        4.5.3. Research and Ongoing Debate

        The use of structured output for Large Language Models (LLMs) is a developing area. While the ability to constrain LLM outputs offer clear benefits in parsing, robustness, and integration, there is growing debate on whether it also potentially comes at the cost of performance as well as reasoning abilities. Research in this area should be taken with a grain of salt since findings are mixed and often depend on the specific task and model family at hand furthermore model families are not always comparable and are getting updated by the day! Nonetheless, early findings provide some interesting insights as to why there is no one-size-fits-all solution when it comes to LLMs structured output.

        There is some evidence indicating that LLMs may have bias in their handling of different output formats [Long et al., 2024]. The study examined common output structures like multiple-choice answers, wrapped text, lists, and key-value mappings. The authors analyzed key LLM model families, namely Gemma, Mistral, and ChatGPT, uncovering bias across multiple tasks and formats. The researchers attributed these biases to the models’ underlying token distributions for different formats. An example of this format bias emerged in the comparison between JSON and YAML outputs. While models like Mistral and Gemma excelled at generating JSON structures, they performed notably worse with YAML. Their YAML outputs often contained extraneous information that degrades output quality. This disparity likely stems from JSON’s prevalence in training data, highlighting how a format’s popularity directly influences model performance. While the studied models can be probably considered outdated by now since models are getting updated on a rapidly fashion, it is important to remark that addressing format bias is critical for advancing LLMs and ensuring their reliable application in real-world scenarios.

        Recent research “Let Me Speak Freely? A Study on the Impact of Format Restrictions on Performance of Large Language Models” [Tam et al., 2024] suggests that imposing format restrictions on LLMs might impact their performance, particularly in reasoning-intensive tasks. Further evidence [Aider, 2024] suggests LLMs may produce lower quality code if they’re asked to return it as part of a structured JSON response, in particular:

        @@ -1223,16 +1232,16 @@

        -

        4.6. Conclusion

        +

        4.6. Conclusion

        Extracting structured output from LLMs is crucial for integrating them into real-world applications. By understanding the challenges and employing appropriate strategies and tools, developers can improve the reliability and usability of LLM-powered systems, unlocking their potential to automate complex tasks and generate valuable insights.

        Prompt engineering and the use of fine-tuned models can help control the output of LLMs. However, when strong guarantees are needed, practitioners should consider techniques such as logit post-processing either by manually adjusting the model’s output logits or using frameworks like Outlines that provider a higher level of control over the generation process.

        -

        4.7. Acknowledgements

        +

        4.7. Acknowledgements

        We would like to thank Cameron Pfiffer from the .txt team for his insightful review and feedback.

        -

        4.8. Citation

        +

        4.8. Citation

        CC BY-NC-SA 4.0

        @misc{tharsistpsouza2024tamingllms,
           author = {Tharsis T. P. Souza},
        @@ -1246,7 +1255,7 @@ 

        -

        4.9. References

        +

        4.9. References

        [Aid24] @@ -1339,8 +1348,8 @@

        3. The Evals Gap

        Table 4.1 Structured Output Frameworks Comparison
        \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
        VariableForecast
        0Real global GDP (% y/y annualized)3.20
        1Real U.S. GDP (% q/q annualized)2.40
        2CPI inflation (% y/y)2.50
        3Core CPI inflation (% y/y)3.00
        4Unemployment rate (%)4.30
        5Fed funds rate, end period (%)3.88
        \n", + "

    " + ], + "text/plain": [ + " Variable Forecast\n", + "0 Real global GDP (% y/y annualized) 3.20\n", + "1 Real U.S. GDP (% q/q annualized) 2.40\n", + "2 CPI inflation (% y/y) 2.50\n", + "3 Core CPI inflation (% y/y) 3.00\n", + "4 Unemployment rate (%) 4.30\n", + "5 Fed funds rate, end period (%) 3.88" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_md_forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    VariableForecast
    0Real global GDP (% y/y annualized)3.20
    1Real U.S. GDP (% q/q annualized)2.40
    2CPI inflation (% y/y)2.50
    3Core CPI inflation (% y/y)3.00
    4Unemployment rate (%)4.30
    5Fed funds rate, end period (%)3.88
    \n", + "
    " + ], + "text/plain": [ + " Variable Forecast\n", + "0 Real global GDP (% y/y annualized) 3.20\n", + "1 Real U.S. GDP (% q/q annualized) 2.40\n", + "2 CPI inflation (% y/y) 2.50\n", + "3 Core CPI inflation (% y/y) 3.00\n", + "4 Unemployment rate (%) 4.30\n", + "5 Fed funds rate, end period (%) 3.88" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_docling_forecasts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results from MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches enabled the LLM to successfully extract the economic forecast data with equal accuracy, in this particular case." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n", + "```{figure} ../_static/input/asset_class.png\n", + "---\n", + "name: asset_class\n", + "alt: Asset Class Weightings\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Asset Class Weightings\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The user will simply define the following data to extract: \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\". In that way, we expect that \"Underweight\" will be mapped to -2, \"Neutral\" to 0 and \"Overweight\" to 2 with some values in between." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "extract_prompt = \"Asset Class Weightings (as of 12/3/2024) in a scale from -2 to 2\"\n", + "asset_class_docling = extract_from_doc(extract_prompt, forecast_result_docling, client)\n", + "asset_class_md = extract_from_doc(extract_prompt, forecast_result_md, client)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "df_md = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_md.forecasts], \n", + " columns=['Variable', 'Forecast'])\n", + "df_docling = pd.DataFrame([(f.financial_variable, f.financial_forecast) for f in asset_class_docling.forecasts], \n", + " columns=['Variable', 'Forecast'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    variablemarkitdowndoclingtrue_value
    0Global Equities1.01.01.0
    1U.S. Large Cap Growth1.01.00.0
    2U.S. Large Cap Value1.01.01.0
    3U.S. Small Cap Growth1.01.01.0
    4U.S. Small Cap Value1.01.01.0
    5International Developed1.0-1.0-1.0
    6Emerging Markets1.00.00.0
    7Global Fixed Income-1.0-1.0-1.0
    8U.S. Governments-1.01.01.0
    9U.S. Mortgages-1.01.01.0
    10U.S. Corporates-1.0-1.0-1.0
    11International Fixed Income-1.00.00.0
    12High Yield-1.0-1.0-1.0
    13U.S. Investment-grade-1.00.00.0
    14Tax Exempt U.S. High Yield Tax Exempt-1.0-1.0-1.0
    \n", + "
    " + ], + "text/plain": [ + " variable markitdown docling true_value\n", + "0 Global Equities 1.0 1.0 1.0\n", + "1 U.S. Large Cap Growth 1.0 1.0 0.0\n", + "2 U.S. Large Cap Value 1.0 1.0 1.0\n", + "3 U.S. Small Cap Growth 1.0 1.0 1.0\n", + "4 U.S. Small Cap Value 1.0 1.0 1.0\n", + "5 International Developed 1.0 -1.0 -1.0\n", + "6 Emerging Markets 1.0 0.0 0.0\n", + "7 Global Fixed Income -1.0 -1.0 -1.0\n", + "8 U.S. Governments -1.0 1.0 1.0\n", + "9 U.S. Mortgages -1.0 1.0 1.0\n", + "10 U.S. Corporates -1.0 -1.0 -1.0\n", + "11 International Fixed Income -1.0 0.0 0.0\n", + "12 High Yield -1.0 -1.0 -1.0\n", + "13 U.S. Investment-grade -1.0 0.0 0.0\n", + "14 Tax Exempt U.S. High Yield Tax Exempt -1.0 -1.0 -1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create DataFrame with specified columns\n", + "df_comparison = pd.DataFrame({\n", + " 'variable': df_docling['Variable'].iloc[:-1],\n", + " 'markitdown': df_md['Forecast'],\n", + " 'docling': df_docling['Forecast'].iloc[:-1], # Drop last row\n", + " 'true_value': [1.0, 0.0, 1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 1.0, 1.0, -1.0, 0.0, -1.0, 0.0, -1.0]\n", + "})\n", + "\n", + "display(df_comparison)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Markitdown accuracy: 53.33%\n", + "Docling accuracy: 93.33%\n" + ] + } + ], + "source": [ + "# Calculate accuracy for markitdown and docling\n", + "markitdown_accuracy = (df_comparison['markitdown'] == df_comparison['true_value']).mean()\n", + "docling_accuracy = (df_comparison['docling'] == df_comparison['true_value']).mean()\n", + "\n", + "print(f\"Markitdown accuracy: {markitdown_accuracy:.2%}\")\n", + "print(f\"Docling accuracy: {docling_accuracy:.2%}\") \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n", + "\n", + "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n", + "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "from docling.document_converter import DocumentConverter" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "def convert_and_export_tables(file_path: Path) -> list[pd.DataFrame]:\n", + " \"\"\"\n", + " Convert document and export tables to DataFrames.\n", + " \n", + " Args:\n", + " file_path: Path to input document\n", + " \n", + " Returns:\n", + " List of pandas DataFrames containing the tables\n", + " \"\"\"\n", + " doc_converter = DocumentConverter()\n", + " start_time = time.time()\n", + " \n", + " conv_res = doc_converter.convert(file_path)\n", + " \n", + " tables = []\n", + " # Export tables\n", + " for table in conv_res.document.tables:\n", + " table_df: pd.DataFrame = table.export_to_dataframe()\n", + " tables.append(table_df)\n", + "\n", + " end_time = time.time() - start_time\n", + " print(f\"Document converted in {end_time:.2f} seconds.\")\n", + " \n", + " return tables\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert and export tables\n", + "tables = convert_and_export_tables(Path(FORECAST_FILE_PATH))" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(tables)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    Total Return in USD (%).CurrentTotal Return in USD (%).WTDTotal Return in USD (%).MTDTotal Return in USD (%).YTD
    0DJIA43,828.06-1.8-2.318.4
    1NASDAQ19,926.720.43.733.7
    2S&P 5006,051.09-0.60.428.6
    3S&P 400 Mid Cap3,277.20-1.6-2.619.5
    4Russell 20002,346.90-2.5-3.517.3
    5MSCI World3,817.24-1.00.222.1
    6MSCI EAFE2,319.05-1.50.26.4
    7MSCI Emerging Markets1,107.010.32.710.6
    \n", + "
    " + ], + "text/plain": [ + " Total Return in USD (%).Current \\\n", + "0 DJIA 43,828.06 \n", + "1 NASDAQ 19,926.72 \n", + "2 S&P 500 6,051.09 \n", + "3 S&P 400 Mid Cap 3,277.20 \n", + "4 Russell 2000 2,346.90 \n", + "5 MSCI World 3,817.24 \n", + "6 MSCI EAFE 2,319.05 \n", + "7 MSCI Emerging Markets 1,107.01 \n", + "\n", + " Total Return in USD (%).WTD Total Return in USD (%).MTD \\\n", + "0 -1.8 -2.3 \n", + "1 0.4 3.7 \n", + "2 -0.6 0.4 \n", + "3 -1.6 -2.6 \n", + "4 -2.5 -3.5 \n", + "5 -1.0 0.2 \n", + "6 -1.5 0.2 \n", + "7 0.3 2.7 \n", + "\n", + " Total Return in USD (%).YTD \n", + "0 18.4 \n", + "1 33.7 \n", + "2 28.6 \n", + "3 19.5 \n", + "4 17.3 \n", + "5 22.1 \n", + "6 6.4 \n", + "7 10.6 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(tables[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    Total Return in USD (%).CurrentTotal Return in USD (%).WTDTotal Return in USD (%).MTDTotal Return in USD (%).YTD
    0Corporate & Government4.66-1.34-0.921.94
    1Agencies4.54-0.58-0.313.35
    2Municipals3.55-0.87-0.541.99
    3U.S. Investment Grade Credit4.79-1.38-0.931.97
    4International5.17-1.40-0.903.20
    5High Yield7.19-0.220.208.87
    690 Day Yield4.324.394.495.33
    72 Year Yield4.244.104.154.25
    810 Year Yield4.404.154.173.88
    930 Year Yield4.604.344.364.03
    \n", + "
    " + ], + "text/plain": [ + " Total Return in USD (%).Current \\\n", + "0 Corporate & Government 4.66 \n", + "1 Agencies 4.54 \n", + "2 Municipals 3.55 \n", + "3 U.S. Investment Grade Credit 4.79 \n", + "4 International 5.17 \n", + "5 High Yield 7.19 \n", + "6 90 Day Yield 4.32 \n", + "7 2 Year Yield 4.24 \n", + "8 10 Year Yield 4.40 \n", + "9 30 Year Yield 4.60 \n", + "\n", + " Total Return in USD (%).WTD Total Return in USD (%).MTD \\\n", + "0 -1.34 -0.92 \n", + "1 -0.58 -0.31 \n", + "2 -0.87 -0.54 \n", + "3 -1.38 -0.93 \n", + "4 -1.40 -0.90 \n", + "5 -0.22 0.20 \n", + "6 4.39 4.49 \n", + "7 4.10 4.15 \n", + "8 4.15 4.17 \n", + "9 4.34 4.36 \n", + "\n", + " Total Return in USD (%).YTD \n", + "0 1.94 \n", + "1 3.35 \n", + "2 1.99 \n", + "3 1.97 \n", + "4 3.20 \n", + "5 8.87 \n", + "6 5.33 \n", + "7 4.25 \n", + "8 3.88 \n", + "9 4.03 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(tables[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    SectorCIO View.CIO View.UnderweightCIO View.NeutralCIO View.CIO View.Overweight
    0Utilitiesslight over weight green 
    1Financialsslight over weight green 
    2Healthcareslight over weight green 
    3Consumer DiscretionarySlight over weight green 
    4Information TechnologyNeutral yellow 
    5Communication ServicesNeutral yellow 
    6IndustrialsNeutral yellow 
    7Real EstateNeutral yellow 
    8Energyslight underweight orange 
    9Materialsslight underweight orange 
    10Consumer Staplesunderweight red
    \n", + "
    " + ], + "text/plain": [ + " Sector CIO View. \\\n", + "0 Utilities slight over weight green  \n", + "1 Financials slight over weight green  \n", + "2 Healthcare slight over weight green  \n", + "3 Consumer Discretionary Slight over weight green  \n", + "4 Information Technology Neutral yellow  \n", + "5 Communication Services Neutral yellow  \n", + "6 Industrials Neutral yellow  \n", + "7 Real Estate Neutral yellow  \n", + "8 Energy slight underweight orange  \n", + "9 Materials slight underweight orange  \n", + "10 Consumer Staples underweight red \n", + "\n", + " CIO View.Underweight CIO View.Neutral CIO View. CIO View.Overweight \n", + "0    \n", + "1    \n", + "2    \n", + "3    \n", + "4    \n", + "5    \n", + "6    \n", + "7    \n", + "8    \n", + "9    \n", + "10     " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(tables[6])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model to its constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "md_llm = MarkItDown(llm_client=client, llm_model=\"gpt-4o-mini\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = md_llm.convert(\"../data/input/forecast.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here's the description we obtain from the image of our input document." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "\n", + "# Description:\n", + "**Markets in Review: Economic Forecasts and Asset Class Weightings (as of 12/13/2024)**\n", + "\n", + "This detailed market overview presents key performance metrics and economic forecasts as of December 13, 2024.\n", + "\n", + "**Equities Overview:**\n", + "- **Total Returns:** Highlights returns for major indices such as the DJIA (18.4% YTD), NASDAQ (33.7% YTD), and S&P 500 (28.6% YTD), showcasing strong performance across the board.\n", + "- **Forecasts:** Economic indicators reveal a projected real global GDP growth of 3.1%, with inflation rates expected to stabilize around 2.2% in 2025. Unemployment rates are anticipated to remain low at 4.4%.\n", + "\n", + "**Fixed Income:**\n", + "- Focuses on various segments, including Corporate & Government bonds, which offer an annualized return of 4.66% and indicate shifting trends in interest rates over 2-Year (4.25%) and 10-Year (4.03%) bonds.\n", + "\n", + "**Commodities & Currencies:**\n", + "- Commodities such as crude oil and gold show varied performance, with oil increasing by 4.8% and gold prices sitting at $2,648.23 per ounce.\n", + "- Currency metrics highlight the Euro and USD trends over the past year.\n", + "\n", + "**S&P Sector Returns:**\n", + "- A quick reference for sector performance indicates a significant 2.5% return in Communication Services, while other sectors like Consumer Staples and Materials display minor fluctuations.\n", + "\n", + "**CIO Asset Class Weightings:**\n", + "- Emphasizes strategic asset allocation recommendations which are crucial for an investor's portfolio. Underweight positions in U.S. Small Cap Growth and International Developed contrast with overweight positions in certain sectors such as Utilities and Financials, signaling tactical shifts based on ongoing economic assessments.\n", + "\n", + "**Note:** This summary is sourced from BofA Global Research and aims to provide a comprehensive view of current market conditions and forecasts to assist investors in making informed decisions.\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(Markdown(result.text_content))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Overall, the description is somewhat accurate but contains a few inaccuracies including:\n", + "\n", + "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n", + "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n", + "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n", + "\n", + "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrieval-Augmented Generation\n", + "\n", + "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n", + "\n", + "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case Studies\n", + "\n", + "This section presents three case studies that demonstrate practical solutions to common LLM limitations:\n", + "\n", + "First, Content Chunking with Contextual Linking showcases how intelligent chunking strategies can overcome both context window and output token limitations. This case study illustrates techniques for breaking down and reassembling content while maintaining coherence, enabling the generation of high-quality long-form outputs despite model constraints.\n", + "\n", + "Second, a Retrieval Augmented Generation case study addresses the challenge of stale or outdated model knowledge. By implementing semantic search over a GitHub repository, this example demonstrates how to augment LLM responses with current, accurate information - allowing users to query and receive up-to-date answers about code repository contents.\n", + "\n", + "Third, the final case study builds a Quiz generator with citations. This case study explores some additional input management techniques that become particularly useful when long context window is available. This includes implementing prompt caching for efficiency and adding citations to enhance response accuracy and verifiability. These approaches show how to maximize the benefits of larger context models while maintaining response quality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study I: Content Chunking with Contextual Linking\n", + "\n", + "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n", + "1. The LLM's inability to process long inputs to do context-size limits\n", + "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n", + "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n", + "\n", + "Here, we exemplify this technique by following these steps:\n", + "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", + "\n", + "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", + "\n", + "3. **Generating Linked Prompts**: For each chunk, a prompt is generated that includes the chunk's content and its context. This prompt is then used to generate the output for that chunk.\n", + "\n", + "4. **Combining the Outputs**: The outputs of all chunks are combined to form the final long-form content.\n", + "\n", + "Let's examine an example implementation of this technique.\n", + "\n", + "#### Generating long-form content\n", + "\n", + "- Goal: Generate a long-form report analyzing a company's financial statement.\n", + "- Input: A company's 10K SEC filing.\n", + "\n", + "```{figure} ../_static/structured_output/diagram1.png\n", + "---\n", + "name: content-chunking-with-contextual-linking\n", + "alt: Content Chunking with Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Content Chunking with Contextual Linking Schematic Representation.\n", + "```\n", + "\n", + "The diagram in {numref}`content-chunking-with-contextual-linking` illustrates the process we will follow for handling long-form content generation with Large Language Models through \"Content Chunking with Contextual Linking.\" It shows how input content is first split into manageable chunks using a chunking function (e.g. `CharacterTextSplitter` with `tiktoken` tokenizer), then each chunk is processed sequentially while maintaining context from previous chunks. For each chunk, the system updates the context, generates a dynamic prompt with specific parameters, makes a call to the LLM chain, and stores the response. After all chunks are processed, the individual responses are combined with newlines to create the final report, effectively working around the token limit constraints of LLMs while maintaining coherence across the generated content.\n", + "\n", + "**Step 1: Chunking the Content**\n", + "\n", + "There are different methods for chunking, and each of them might be appropriate for different situations. However, we can broadly group chunking strategies in two types:\n", + "- **Fixed-size Chunking**: This is the most common and straightforward approach to chunking. We simply decide the number of tokens in our chunk and, optionally, whether there should be any overlap between them. In general, we will want to keep some overlap between chunks to make sure that the semantic context doesn’t get lost between chunks. Fixed-sized chunking may be a reasonable path in many common cases. Compared to other forms of chunking, fixed-sized chunking is computationally cheap and simple to use since it doesn’t require the use of any specialied techniques or libraries.\n", + "- **Content-aware Chunking**: These are a set of methods for taking advantage of the nature of the content we’re chunking and applying more sophisticated chunking to it. Examples include:\n", + " - **Sentence Splitting**: Many models are optimized for embedding sentence-level content. Naturally, we would use sentence chunking, and there are several approaches and tools available to do this, including naive splitting (e.g. splitting on periods), NLTK, and spaCy.\n", + " - **Recursive Chunking**: Recursive chunking divides the input text into smaller chunks in a hierarchical and iterative manner using a set of separators.\n", + " - **Semantic Chunking**: This is a class of methods that leverages embeddings to extract the semantic meaning present in your data, creating chunks that are made up of sentences that talk about the same theme or topic.\n", + "\n", + " Here, we will utilize `langchain` for a content-aware sentence-splitting strategy for chunking. Langchain offers several text splitters {cite}`langchain_text_splitters` such as JSON-, Markdown- and HTML-based or split by token. We will use the `CharacterTextSplitter` with `tiktoken` as our tokenizer to count the number of tokens per chunk which we can use to ensure that we do not surpass the input token limit of our model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chunks(text: str, chunk_size: int, chunk_overlap: int) -> list:\n", + " \"\"\"\n", + " Split input text into chunks of specified size with specified overlap.\n", + "\n", + " Args:\n", + " text (str): The input text to be chunked.\n", + " chunk_size (int): The maximum size of each chunk in tokens.\n", + " chunk_overlap (int): The number of tokens to overlap between chunks.\n", + "\n", + " Returns:\n", + " list: A list of text chunks.\n", + " \"\"\"\n", + " from langchain_text_splitters import CharacterTextSplitter\n", + "\n", + " text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)\n", + " return text_splitter.split_text(text)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 2: Writing the Base Prompt Template**\n", + "\n", + "We will write a base prompt template which will serve as a foundational structure for all chunks, ensuring consistency in the instructions and context provided to the language model. The template includes the following parameters:\n", + "- `role`: Defines the role or persona the model should assume.\n", + "- `context`: Provides the background information or context for the task.\n", + "- `instruction`: Specifies the task or action the model needs to perform.\n", + "- `input_text`: Contains the actual text input that the model will process.\n", + "- `requirements`: Lists any specific requirements or constraints for the output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.prompts import PromptTemplate\n", + "def get_base_prompt_template() -> str:\n", + " \n", + " base_prompt = \"\"\"\n", + " ROLE: {role}\n", + " CONTEXT: {context}\n", + " INSTRUCTION: {instruction}\n", + " INPUT: {input}\n", + " REQUIREMENTS: {requirements}\n", + " \"\"\"\n", + " \n", + " prompt = PromptTemplate.from_template(base_prompt)\n", + " return prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will write a simple function that returns an `LLMChain` which is a simple `langchain` construct that allows you to chain together a combination of prompt templates, language models and output parsers." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_community.chat_models import ChatLiteLLM\n", + "\n", + "def get_llm_chain(prompt_template: str, model_name: str, temperature: float = 0):\n", + " \"\"\"\n", + " Returns an LLMChain instance using langchain.\n", + "\n", + " Args:\n", + " prompt_template (str): The prompt template to use.\n", + " model_name (str): The name of the model to use.\n", + " temperature (float): The temperature setting for the model.\n", + "\n", + " Returns:\n", + " llm_chain: An instance of the LLMChain.\n", + " \"\"\"\n", + " \n", + " from dotenv import load_dotenv\n", + " import os\n", + "\n", + " # Load environment variables from .env file\n", + " load_dotenv()\n", + " \n", + " api_key_label = model_name.split(\"/\")[0].upper() + \"_API_KEY\"\n", + " llm = ChatLiteLLM(\n", + " model=model_name,\n", + " temperature=temperature,\n", + " api_key=os.environ[api_key_label],\n", + " )\n", + " llm_chain = prompt_template | llm | StrOutputParser()\n", + " return llm_chain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Step 3: Constructing Dynamic Prompt Parameters**\n", + "\n", + "Now, we will write a function (`get_dynamic_prompt_template`) that constructs prompt parameters dynamically for each chunk." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Dict\n", + "def get_dynamic_prompt_params(prompt_params: Dict, \n", + " part_idx: int, \n", + " total_parts: int,\n", + " chat_context: str,\n", + " chunk: str) -> str:\n", + " \"\"\"\n", + " Construct prompt template dynamically per chunk while maintaining the chat context of the response generation.\n", + " \n", + " Args:\n", + " prompt_params (Dict): Original prompt parameters\n", + " part_idx (int): Index of current conversation part\n", + " total_parts (int): Total number of conversation parts\n", + " chat_context (str): Chat context from previous parts\n", + " chunk (str): Current chunk of text to be processed\n", + " Returns:\n", + " str: Dynamically constructed prompt template with part-specific params\n", + " \"\"\"\n", + " dynamic_prompt_params = prompt_params.copy()\n", + " # saves the chat context from previous parts\n", + " dynamic_prompt_params[\"context\"] = chat_context\n", + " # saves the current chunk of text to be processed as input\n", + " dynamic_prompt_params[\"input\"] = chunk\n", + " \n", + " # Add part-specific instructions\n", + " if part_idx == 0: # Introduction part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the Introduction part of a long report.\n", + " Don't cover any topics yet, just define the scope of the report.\n", + " \"\"\"\n", + " elif part_idx == total_parts - 1: # Conclusion part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating the last part of a long report. \n", + " For this part, first discuss the below INPUT. Second, write a \"Conclusion\" section summarizing the main points discussed given in CONTEXT.\n", + " \"\"\"\n", + " else: # Main analysis part\n", + " dynamic_prompt_params[\"instruction\"] = f\"\"\"\n", + " You are generating part {part_idx+1} of {total_parts} parts of a long report.\n", + " For this part, analyze the below INPUT.\n", + " Organize your response in a way that is easy to read and understand either by creating new or merging with previously created structured sections given in CONTEXT.\n", + " \"\"\"\n", + " \n", + " return dynamic_prompt_params" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "**Step 4: Generating the Report**\n", + "\n", + "Finally, we will write a function that generates the actual report by calling the `LLMChain` with the dynamically updated prompt parameters for each chunk and concatenating the results at the end." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generate_report(input_content: str, llm_model_name: str, \n", + " role: str, requirements: str,\n", + " chunk_size: int, chunk_overlap: int) -> str:\n", + " # stores the parts of the report, each generated by an individual LLM call\n", + " report_parts = [] \n", + " # split the input content into chunks\n", + " chunks = get_chunks(input_content, chunk_size, chunk_overlap)\n", + " # initialize the chat context with the input content\n", + " chat_context = input_content\n", + " # number of parts to be generated\n", + " num_parts = len(chunks)\n", + "\n", + " prompt_params = {\n", + " \"role\": role, # user-provided\n", + " \"context\": \"\", # dinamically updated per part\n", + " \"instruction\": \"\", # dynamically updated per part\n", + " \"input\": \"\", # dynamically updated per part\n", + " \"requirements\": requirements #user-priovided\n", + " }\n", + "\n", + " # get the LLMChain with the base prompt template\n", + " llm_chain = get_llm_chain(get_base_prompt_template(), \n", + " llm_model_name)\n", + "\n", + " # dynamically update prompt_params per part\n", + " print(f\"Generating {num_parts} report parts\")\n", + " for i, chunk in enumerate(chunks):\n", + " dynamic_prompt_params = get_dynamic_prompt_params(\n", + " prompt_params,\n", + " part_idx=i,\n", + " total_parts=num_parts,\n", + " chat_context=chat_context,\n", + " chunk=chunk\n", + " )\n", + " \n", + " # invoke the LLMChain with the dynamically updated prompt parameters\n", + " response = llm_chain.invoke(dynamic_prompt_params)\n", + "\n", + " # update the chat context with the cummulative response\n", + " if i == 0:\n", + " chat_context = response\n", + " else:\n", + " chat_context = chat_context + response\n", + " \n", + " print(f\"Generated part {i+1}/{num_parts}.\")\n", + " report_parts.append(response)\n", + "\n", + " report = \"\\n\".join(report_parts)\n", + " return report" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Example Usage**\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the text from sample 10K SEC filing\n", + "with open('../data/apple.txt', 'r') as file:\n", + " text = file.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the chunk and chunk overlap size\n", + "MAX_CHUNK_SIZE = 10000\n", + "MAX_CHUNK_OVERLAP = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "report = generate_report(text, llm_model_name=\"gemini/gemini-1.5-flash-latest\", \n", + " role=\"Financial Analyst\", \n", + " requirements=\"The report should be in a readable, structured format, easy to understand and follow. Focus on finding risk factors and market moving insights.\",\n", + " chunk_size=MAX_CHUNK_SIZE, \n", + " chunk_overlap=MAX_CHUNK_OVERLAP)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Save the generated report to a local file\n", + "with open('data/apple_report.txt', 'w') as file:\n", + " file.write(report)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "**Introduction**\n", + "\n", + "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for the fiscal year ended September 28, 2024, as disclosed in its Form 10-K filing with the United States Securities and Exchange Commission. The analysis will focus on identifying key risk factors impacting Apple's business, evaluating its financial health, and uncovering market-moving insights derived from the provided data. The report will delve into Apple's various segments, product lines, and services, examining their performance and contributions to overall financial results. Specific attention will be paid to identifying trends, potential challenges, and opportunities for future growth. The analysis will also consider the broader macroeconomic environment and its influence on Apple's operations and financial outlook. Finally, the report will incorporate relevant information from Apple's definitive proxy statement for its 2025 annual meeting of shareholders, as incorporated by reference in the Form 10-K.\n", + "\n", + "**PART 2: Key Risk Factors and Market-Moving Insights**\n", + "\n", + "This section analyzes key risk factors disclosed in Apple Inc.'s 2024 Form 10-K, focusing on their potential impact on financial performance and identifying potential market-moving insights. The analysis is structured around the major risk categories identified in the filing.\n", + "\n", + "**2.1 Dependence on Third-Party Developers:**\n", + "\n", + "Apple's success is heavily reliant on the continued support and innovation of third-party software developers. The Form 10-K highlights several critical aspects of this dependence:\n", + "\n", + "* **Market Share Vulnerability:** Apple's relatively smaller market share in smartphones, personal computers, and tablets compared to competitors (Android, Windows, gaming consoles) could discourage developers from prioritizing Apple's platform, leading to fewer high-quality apps and potentially impacting customer purchasing decisions. This is a significant risk, especially given the rapid pace of technological change. A decline in app availability or quality could negatively impact sales and market share. **Market-moving insight:** Monitoring developer activity and app quality across competing platforms is crucial for assessing this risk. Any significant shift in developer focus away from iOS could be a negative market signal.\n", + "\n", + "* **App Store Dynamics:** While Apple allows developers to retain most App Store revenue, its commission structure and recent changes (e.g., complying with the Digital Markets Act (DMA) in the EU) introduce uncertainty. Changes to the App Store's policies or fee structures could materially affect Apple's revenue and profitability. **Market-moving insight:** Closely monitoring regulatory developments (especially concerning the DMA) and their impact on App Store revenue is essential. Any significant changes to Apple's App Store policies or revenue streams could trigger market reactions.\n", + "\n", + "* **Content Acquisition and Creation:** Apple's reliance on third-party digital content providers for its services introduces risks related to licensing agreements, competition, and pricing. The cost of producing its own digital content is also increasing due to competition for talent and subscribers. Failure to secure or create appealing content could negatively impact user engagement and revenue. **Market-moving insight:** Analyzing the success of Apple's original content initiatives and the renewal rates of third-party content agreements will provide insights into this risk.\n", + "\n", + "**2.2 Operational Risks:**\n", + "\n", + "\n", + " (...) \n", + "\n", + " The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability. While increased R&D is generally positive, it reduces short-term profits. The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China. **Market-moving insight:** Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n", + "\n", + "\n", + "**5.4 Auditor's Report and Internal Controls:**\n", + "\n", + "The auditor's report expresses an unqualified opinion on Apple's financial statements and internal control over financial reporting. However, it identifies uncertain tax positions as a critical audit matter. The significant amount of unrecognized tax benefits ($22.0 billion) and the complexity involved in evaluating these positions highlight a substantial risk. Management's assessment of these positions involves significant judgment and relies on interpretations of complex tax laws. Apple's management also asserts that its disclosure controls and procedures are effective. **Market-moving insight:** Any changes in tax laws, unfavorable rulings on uncertain tax positions, or weaknesses in internal controls could materially affect Apple's financial results and investor confidence.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "This report provides a comprehensive analysis of Apple Inc.'s financial performance and position for fiscal year 2024. While Apple maintains a strong financial position with substantial cash reserves and a robust capital return program, several key risk factors could significantly impact its future performance. These risks include:\n", + "\n", + "* **Dependence on third-party developers:** A shift in developer focus away from iOS or changes to the App Store's policies could negatively impact Apple's revenue and profitability.\n", + "* **Operational risks:** Employee retention challenges, reseller dependence, and cybersecurity threats pose significant operational risks.\n", + "* **Legal and regulatory risks:** Ongoing antitrust litigation, the Digital Markets Act (DMA) compliance, and data privacy regulations introduce substantial legal and regulatory uncertainties.\n", + "* **Financial risks:** Volatility in sales and profit margins, foreign exchange rate fluctuations, credit risk, and tax risks could impact Apple's financial performance.\n", + "* **Supply chain concentration:** Apple's reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, and dependence on single or limited sources for certain custom components, exposes the company to significant supply chain risks.\n", + "* **Uncertain tax positions:** The significant amount of unrecognized tax benefits represents a substantial uncertainty that could materially affect Apple's financial results.\n", + "\n", + "Despite these risks, Apple's strong liquidity position, continued growth in its Services segment, and robust capital return program provide a degree of resilience. However, investors and analysts should closely monitor the market-moving insights identified throughout this report, including developer activity, regulatory developments, regional economic conditions, supply chain stability, and the resolution of uncertain tax positions, to assess their potential impact on Apple's future performance and valuation. The significant short-term obligations, while manageable given Apple's cash position, highlight the need for continued financial discipline and effective risk management. A deeper, more granular analysis of the financial statements and notes is recommended for a more complete assessment." + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Read and display the generated report\n", + "with open('../data/apple_report.txt', 'r') as file:\n", + " report_content = file.read()\n", + " \n", + "from IPython.display import Markdown\n", + "\n", + "# Display first and last 10% of the report content\n", + "report_lines = report_content.splitlines()\n", + "total_lines = len(report_lines)\n", + "quarter_lines = total_lines // 10\n", + "\n", + "top_portion = '\\n'.join(report_lines[:quarter_lines])\n", + "bottom_portion = '\\n'.join(report_lines[-quarter_lines:])\n", + "\n", + "display(Markdown(f\"{top_portion}\\n\\n (...) \\n\\n {bottom_portion}\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Discussion\n", + "\n", + "Results from the generated report present a few interesting aspects:\n", + "\n", + "- **Coherence**: The generated report demonstrates an apparent level of coherence. The sections are logically structured, and the flow of information is smooth. Each part of the report builds upon the previous sections, providing a comprehensive analysis of Apple Inc.'s financial performance and key risk factors. The use of headings and subheadings helps in maintaining clarity and organization throughout the document.\n", + "\n", + "- **Adherence to Instructions**: The LLM followed the provided instructions effectively. The report is in a readable, structured format, and it focuses on identifying risk factors and market-moving insights as requested. The analysis is detailed and covers various aspects of Apple's financial performance, including revenue segmentation, profitability, liquidity, and capital resources. The inclusion of market-moving insights adds value to the report, aligning with the specified requirements.\n", + "\n", + "Despite the seemingly good quality of the results, there are some limitations to consider:\n", + "\n", + "- **Depth of Analysis**: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert's evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.\n", + "\n", + "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n", + "\n", + "Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic's Contextual Retrieval {cite}`anthropic2024contextualretrieval`. The approach, as shown in {numref}`anth_contextual`, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.\n", + "```{figure} ../_static/input/anth_contextual.png\n", + "---\n", + "name: anth_contextual\n", + "alt: Anthropic Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Anthropic Contextual Linking {cite}`anthropic2024contextualretrieval`.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study II: Github RAG\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Case Study III: Quiz Generation with Citations\n", + "\n", + "In this case study, we will build a Quiz generator with citations that explores additional input management techniques particularly useful with long context windows. The implementation includes prompt caching for efficiency and citation tracking to enhance accuracy and verifiability. We will use Gemini 1.5 Pro as our LLM model, which has a context window of 2M tokens.\n", + "\n", + "#### Use Case\n", + "\n", + "Let's assume you are a Harvard student enrolled in GOV 1039 \"The Birth of Modern Democracy\" (see {numref}`harvard-class`), you face a daunting reading list for next Tuesday's class on Rights. The readings include foundational documents like the Magna Carta, Declaration of Independence, and US Bill of Rights, each with specific sections to analyze.\n", + "\n", + "```{figure} ../_static/input/harvard.png\n", + "---\n", + "name: harvard-class\n", + "alt: Harvard Class\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Harvard's Democratic Theory Class\n", + "```\n", + "\n", + "Instead of trudging through these dense historical texts sequentially, we would like to:\n", + "- Extract key insights and connections between these documents, conversationally.\n", + "- Engage with the material through a quiz format.\n", + "- Add citations to help with verifying answers.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Implementation\n", + "\n", + "The full implementation is available at Book's [Github repository](https://github.com/souzatharsis/tamingLLMs/tamingllms/notebooks/src/gemini_duo.py). Here, we will cover the most relevant parts of the implementation.\n", + "\n", + "**Client Class**\n", + "\n", + "First, we will define the `Client` class which will provide the key interface users will interact with. It has the following summarized interface:\n", + "\n", + "- Initialization:\n", + " - `__init__(knowledge_base: List[str] = [])`: Initialize with optional list of URLs as knowledge base\n", + "\n", + "- Core Methods:\n", + " - `add_knowledge_base(urls: List[str]) -> None`: Add URLs to the knowledge base\n", + " - `add(urls: List[str]) -> None`: Extract content from URLs and add to conversation input\n", + " - `msg(msg: str = \"\", add_citations: bool = False) -> str`: Enables users to send messages to the client\n", + " - `quiz(add_citations: bool = True, num_questions: int = 10) -> str`: Generate a quiz based on full input memory\n", + "\n", + "- Key Attributes:\n", + " - `knowledge_base`: List of URLs providing foundation knowledge\n", + " - `input`: Current input being studied (short-term memory)\n", + " - `input_memory`: Cumulative input + knowledge base (long-term memory) \n", + " - `response`: Latest response from LLM\n", + " - `response_memory`: Cumulative responses (long-term memory)\n", + " - `urls_memory`: Cumulative list of processed URLs\n", + "\n", + "\n", + "**Corpus-in-Context Prompting**\n", + "\n", + "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n", + "\n", + "{numref}`cic` shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.\n", + "\n", + "```{figure} ../_static/input/cic.png\n", + "---\n", + "name: cic\n", + "alt: CIC Format\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Example of Corpus-in-Context Prompting for retrieval. \n", + "```\n", + "\n", + "CiC prompting leverages LLM's capacity to follow instructions by carefully annotating the corpus with document IDs. It benefits from a strong, capable models to retrieve over large corpora provided in context. \n", + "\n", + "```python\n", + " def add(self, urls: List[str]) -> None:\n", + " self.urls = urls\n", + "\n", + " # Add new content to input following CIC format to enable citations\n", + " for url in urls:\n", + " self.urls_memory.append(url)\n", + " content = self.extractor.convert(url).text_content\n", + " formatted_content = f\"ID: {self.reference_id} | {content} | END ID: {self.reference_id}\"\n", + " self.input += formatted_content + \"\\n\" \n", + " self.reference_id += 1\n", + " \n", + " # Update memory\n", + " self.input_memory = self.input_memory + self.input\n", + "```\n", + "\n", + "The method `add_knowledge_base()` is a simple wrapper around the `add()` method. It is used to add URLs to the knowledge base, which are later cached by the LLM model as we will see later.\n", + "\n", + "```python\n", + " def add_knowledge_base(self, urls: List[str]) -> None:\n", + " self.add(urls)\n", + "```\n", + "\n", + "\n", + "Later, when the user sends a message to the client, the `msg()` method is used to generate a response while enabling citations. `self.content_generator` is an instance of our LLM model, which we will go through next.\n", + "\n", + "```python\n", + " def msg(self, msg: str = \"\", add_citations: bool = False) -> str:\n", + " if add_citations:\n", + " msg = msg + \"\\n\\n For key statements, add Input ID to the response.\"\n", + "\n", + " self.response = self.content_generator.generate(\n", + " input_content=self.input,\n", + " user_instructions=msg\n", + " )\n", + "\n", + " self.response_memory = self.response_memory + self.response.text\n", + "\n", + " return self.response.text\n", + "```\n", + "\n", + "**Prompt Caching**\n", + "\n", + "LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.\n", + "\n", + "In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` containing the entire knowledge base and any additional user added content.\n", + "```python\n", + "self.llm = LLMBackend(input=self.input_memory)\n", + "```\n", + "\n", + "In our `LLMBackend` Class, we leverage prompt caching on input tokens and uses them for subsequent requests.\n", + "\n", + "```python\n", + "class LLMBackend:\n", + " def __init__(self, model_name: str, input: str, cache_ttl: int = 60):\n", + " self.cache = caching.CachedContent.create(\n", + " model=model_name,\n", + " display_name='due_knowledge_base', # used to identify the cache\n", + " system_instruction=(\n", + " self.compose_prompt(input, conversation_config)\n", + " ),\n", + " ttl=datetime.timedelta(minutes=cache_ttl),\n", + " )\n", + "\n", + " self.model = genai.GenerativeModel.from_cached_content(cached_content=self.cache)\n", + "```\n", + "\n", + "**Quiz Generation**\n", + "\n", + "Coming back to our `Client` class, we implement the `quiz()` method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.\n", + "\n", + "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke its `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n", + "\n", + "```python\n", + " def quiz(self, add_citations: bool = True, num_questions: int = 10) -> str:\n", + " \"\"\"\n", + " Returns a quiz instance based on full input memory.\n", + " \"\"\"\n", + " self.quiz_instance = Quiz(\n", + " input=self.input_memory,\n", + " add_citations=add_citations,\n", + " num_questions=num_questions)\n", + " return self.quiz_instance\n", + "```\n", + "\n", + "We write a simple prompt template for quiz generation:\n", + "\n", + "> ROLE:\n", + "> - You are a Harvard Professor providing a quiz.\n", + "> INSTRUCTIONS:\n", + "> - Generate a quiz with {num_questions} questions based on the input.\n", + "> - The quiz should be multi-choice.\n", + "> - Answers should be provided at the end of the quiz.\n", + "> - Questions should have broad coverage of the input including multiple Input IDs.\n", + "> - Level of difficulty is advanced/hard.\n", + "> - {{citations}}\n", + ">\n", + "> STRUCTURE:\n", + "> - Sequence of questions and alternatives.\n", + "> - At the end provide the correct answers.\n", + "\n", + "where, `{citations}` instructs the model to add CiC citations to the response if user requests it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example Usage\n", + "\n", + "\n", + "**Dataset**\n", + "\n", + "First, we will define our knowledge base. \n", + "\n", + "- Harvard Class: [GOV 1039 Syllabus](https://scholar.harvard.edu/files/dlcammack/files/gov_1039_syllabus.pdf)\n", + "- Class / Topic: \"Rights\"\n", + "- Reading List:\n", + " - ID 1. The Declaration of Independence of the United States of America\n", + " - ID 2. The United States Bill of Rights\n", + " - ID 3. John F. Kennedy's Inaugural Address\n", + " - ID 4. Lincoln's Gettysburg Address\n", + " - ID 5. The United States Constitution\n", + " - ID 6. Give Me Liberty or Give Me Death\n", + " - ID 7. The Mayflower Compact\n", + " - ID 8. Abraham Lincoln's Second Inaugural Address\n", + " - ID 9. Abraham Lincoln's First Inaugural Address\n", + "\n", + "We will take advantage of Project Gutenberg's to create our knowledge base." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "kb = [f\"https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt\" for i in range(1,9)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will import our module `gemini_duo` as `genai_duo` and initialize the `Client` class with our knowledge base." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import gemini_duo as genai_duo\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "duo = genai_duo.Client(knowledge_base=kb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, we converted each book into markdown using MarkitDown and cached the content in our LLM model. We can access how many tokens we have cached in our LLM model by looking at the `usage_metadata` attribute of the Gemini's model response. At this point, we have cached at total of 38470 tokens.\n", + "\n", + "Now, we can add references to our knowledge base at anytime by calling the `add()` method. We add the following references:\n", + "1. The Magna Carta\n", + "2. William Shap McKechnie on Magna Carta book" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "study_references = [\"https://www.gutenberg.org/cache/epub/10000/pg10000.txt\", \"https://www.gutenberg.org/cache/epub/65363/pg65363.txt\"]\n", + "\n", + "duo.add(study_references)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can instantiate a `Quiz` object and generate a quiz based on the full input memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "quiz = duo.quiz(add_citations=True)\n", + "display(Markdown(quiz.generate()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "{numref}`quiz` shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n", + "\n", + "```{figure} ../_static/input/quiz.png\n", + "---\n", + "name: quiz\n", + "alt: Quiz with Citations\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Sample Quiz with Citations.\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Discussion\n", + "\n", + "The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n", + "\n", + "However, several limitations emerged during this process:\n", + "\n", + "1. Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.\n", + "\n", + "2. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n", + "\n", + "3. Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.\n", + "\n", + "While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]\n", + "\n", + "[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/\n", + "[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png\n", + "[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC-BY--NC--SA-4.0-lightgrey.svg\n", + "\n", + "```\n", + "@misc{tharsistpsouza2024tamingllms,\n", + " author = {Tharsis T. P. Souza},\n", + " title = {Taming LLMs: A Practical Guide to LLM Pitfalls with Open Source Software},\n", + " year = {2024},\n", + " chapter = {Managing Input Data},\n", + " journal = {GitHub repository},\n", + " url = {https://github.com/souzatharsis/tamingLLMs)\n", + "}\n", + "```\n", + "## References\n", + "```{bibliography}\n", + ":filter: docname in docnames\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb index 1c42990..a95dcb4 100644 --- a/tamingllms/_build/jupyter_execute/notebooks/local.ipynb +++ b/tamingllms/_build/jupyter_execute/notebooks/local.ipynb @@ -181,11 +181,11 @@ "Performance Comparison including proprietary models.\n", "```\n", "\n", - "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n", + "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive cost efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n", "\n", - "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.\n", + "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.\n", "\n", - "```{figure} ../_static/local/deep.png\n", + "```{figure} ../_static/local/deep.jpeg\n", "---\n", "name: deep\n", "alt: DeepSeek-V3\n", @@ -195,7 +195,7 @@ "DeepSeek-V3 Performance Comparison\n", "```\n", "\n", - "```{figure} ../_static/local/deep2.png\n", + "```{figure} ../_static/local/deep2.jpeg\n", "---\n", "name: deep2\n", "alt: DeepSeek-V3 Cost Benefit Analysis\n", diff --git a/tamingllms/_static/input/anth_contextual.png b/tamingllms/_static/input/anth_contextual.png new file mode 100644 index 0000000..c8401c0 Binary files /dev/null and b/tamingllms/_static/input/anth_contextual.png differ diff --git a/tamingllms/_static/input/asset_class.png b/tamingllms/_static/input/asset_class.png new file mode 100644 index 0000000..237d081 Binary files /dev/null and b/tamingllms/_static/input/asset_class.png differ diff --git a/tamingllms/_static/input/docling.png b/tamingllms/_static/input/docling.png new file mode 100644 index 0000000..143ded9 Binary files /dev/null and b/tamingllms/_static/input/docling.png differ diff --git a/tamingllms/_static/input/markitdown.png b/tamingllms/_static/input/markitdown.png new file mode 100644 index 0000000..282503c Binary files /dev/null and b/tamingllms/_static/input/markitdown.png differ diff --git a/tamingllms/_toc.yml b/tamingllms/_toc.yml index fa8fb2f..74069fe 100644 --- a/tamingllms/_toc.yml +++ b/tamingllms/_toc.yml @@ -10,7 +10,7 @@ chapters: - file: markdown/intro.md - file: notebooks/evals.ipynb - file: notebooks/structured_output.ipynb -#- file: notebooks/input.ipynb +- file: notebooks/input.ipynb - file: notebooks/safety.ipynb - file: notebooks/alignment.ipynb - file: notebooks/local.ipynb diff --git a/tamingllms/notebooks/input.ipynb b/tamingllms/notebooks/input.ipynb index c323ec5..d05669d 100644 --- a/tamingllms/notebooks/input.ipynb +++ b/tamingllms/notebooks/input.ipynb @@ -21,10 +21,22 @@ "source": [ "## Introduction\n", "\n", + "Large Language Models face several critical challenges in effectively processing input data. While advances in long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` have expanded the amount of information these systems can process simultaneously, significant challenges remain in managing and effectively utilizing extended inputs. \n", "\n", + "LLMs are sensitive to input formatting and structure, requiring careful data preparation to achieve optimal results {cite}`tan2024htmlraghtmlbetterplain`. They operate with knowledge cutoffs, providing potentially stale or outdated information that may not reflect current reality and demonstrate problems with temporal knowledge accuracy {cite}`amayuelas-etal-2024-knowledge`. LLMs also struggle with less common but important information showing a systematic loss of long-tail knowledge {cite}`kotha2024understanding`.\n", "\n", + "Motivated by these challenges, this chapter explores two key components:\n", "\n", - "When building applications with language models, developers often default to complex architectures involving retrieval systems, chunking strategies, and sophisticated pipelines. However, these approaches add unnecessary complexity when simpler solutions exist. This is where long-context language models (LCLMs) {cite}`lee2024longcontextlanguagemodelssubsume` come in. LCLMs are a new class of models that can process massive amounts of text - up to millions of tokens - in a single forward pass. This capability means they can directly ingest and reason about entire documents or datasets without requiring external tools or complex preprocessing steps. The implications are significant: developers can build more maintainable systems by simply feeding raw text to the model rather than orchestrating complicated retrieval and chunking pipelines. Recent benchmarks have shown that this straightforward approach can match or exceed the performance of more complex systems like RAG, despite never being explicitly trained for such tasks. Before implementing sophisticated architectures, developers should first evaluate whether an LCLM's native capabilities might offer a simpler path to their goals." + "1. Data Parsing: Parsing documents into a unified format that is suitable for LLMs to process.\n", + "2. Retrieval Augmentation: Augmenting LLMs with the ability to retrieve relevant, recent, and specialized information.\n", + "\n", + "In data parsing, we will explore some useful open source tools that help transform data into LLM-compatible formats, demonstrating their impact through a case study of structured information extraction from complex PDFs. In a second case study, we will introduce some chunking strategies to help LLMs process long inputs and implement a particular technique called Chunking with Contextual Linking the enables contextually relevant chunk processing.\n", + "\n", + "In retrieval augmentation, we will explore how to enhance LLMs with semantic search capabilities for incorporating external context using RAGs (Retrieval Augmented Generation). Through a detailed case study, we build a RAG system for querying live codebases, illustrating methods to bridge static model knowledge with dynamic information requirements.\n", + "\n", + "In our last case study, we build a quiz generator using a LLM with large context window. We will explore some additional relevant techniques such as prompt caching and response verification through citations.\n", + "\n", + "By the chapter's conclusion, readers will possess relevant knowledge of input data management strategies for LLMs and practical expertise in selecting and implementing appropriate approaches and tools for specific use cases." ] }, { @@ -33,9 +45,9 @@ "source": [ "## Parsing Documents\n", "\n", - "When discussing document processing with LLMs, there's often a focus on sophisticated algorithms from chunking to contextual inferencing to RAGs. However, this misses the core challenge in production systems, which is 80% about cleaning and normalizing the input, and 20% about actually algorithmic inferencing.\n", + "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores tools and frameworks that streamline input data processing, in particular for parsing purposes, providing a unified interface for converting diverse data formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details while maximizing the performance of the LLM.\n", "\n", - "Building robust data ingestion and preprocessing pipelines is essential for any LLM application. This section explores powerful tools and frameworks like MarkItDown, Docling, and LangChain that streamline document processing. These tools provide unified interfaces for converting diverse document formats into standardized representations that LLMs can effectively process. By abstracting away format-specific complexities, they allow developers to focus on core application logic rather than parsing implementation details.\n" + "We will cover open source tools and frameworks that provide parsing capabilities for a wide range of data formats. And we will demonstrate how some of these tools can be used to extract structured information from complex PDFs also discussing how the quality of the parser can impact LLM's performance." ] }, { @@ -44,7 +56,7 @@ "source": [ "### MarkItDown\n", "\n", - "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats. The tool is particularly useful for document indexing and text analysis tasks.\n", + "MarkItDown is a Python package and CLI too developed by the Microsoft AutoGen team for converting various file formats to Markdown. It supports a wide range of formats including PDF, PowerPoint, Word, Excel, images (with OCR and EXIF metadata), audio (with transcription), HTML, and other text-based formats making it a useful tool for document indexing and LLM-based applications.\n", "\n", "Key features:\n", "- Simple command-line and Python API interfaces\n", @@ -95,14 +107,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Case Study: Structured Data Extraction" + "### Structured Data Extraction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A common use case where document parsing matters is to extract structured data from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. {numref}`forecast` shows page 7 of the mentioned document, which contains several economic variables. \n", + "A common use case where document parsing matters is to structured data extraction from documents, particularly in the presence of complex formatting and layout. In this case study, we will extract the economic forecasts from Merrill Lynch's CIO Capital Market Outlook released on December 16, 2024 {cite:p}`merrill2024`. We will focus on page 7 of this document, which contains several economic variables organized in a mix of tables, text and images (see {numref}`forecast`)\n", "\n", "\n", "```{figure} ../data/input/forecast.png\n", @@ -116,13 +128,6 @@ "```" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We will focus on the page containing the economic forecasts." - ] - }, { "cell_type": "code", "execution_count": 76, @@ -265,141 +270,35 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "## MARKETS IN REVIEW\n", - "\n", - "## Equities\n", - "\n", - "| | Total Return in USD (%) | Total Return in USD (%) | Total Return in USD (%) | Total Return in USD (%) |\n", - "|-----------------------|---------------------------|---------------------------|---------------------------|---------------------------|\n", - "| | Current | WTD | MTD | YTD |\n", - "| DJIA | 43,828.06 | -1.8 | -2.3 | 18.4 |\n", - "| NASDAQ | 19,926.72 | 0.4 | 3.7 | 33.7 |\n", - "| S&P 500 | 6,051.09 | -0.6 | 0.4 | 28.6 |\n", - "| S&P 400 Mid Cap | 3,277.20 | -1.6 | -2.6 | 19.5 |\n", - "| Russell 2000 | 2,346.90 | -2.5 | -3.5 | 17.3 |\n", - "| MSCI World | 3,817.24 | -1.0 | 0.2 | 22.1 |\n", - "| MSCI EAFE | 2,319.05 | -1.5 | 0.2 | 6.4 |\n", - "| MSCI Emerging Markets | 1,107.01 | 0.3 | 2.7 | 10.6 |\n", - "\n", - "## Fixed Income †\n", - "\n", - "| | Total Return in USD (%) | Total Return in USD (%) | Total Return in USD (%) | Total Return in USD (%) |\n", - "|------------------------------|---------------------------|---------------------------|---------------------------|---------------------------|\n", - "| | Current | WTD | MTD | YTD |\n", - "| Corporate & Government | 4.66 | -1.34 | -0.92 | 1.94 |\n", - "| Agencies | 4.54 | -0.58 | -0.31 | 3.35 |\n", - "| Municipals | 3.55 | -0.87 | -0.54 | 1.99 |\n", - "| U.S. Investment Grade Credit | 4.79 | -1.38 | -0.93 | 1.97 |\n", - "| International | 5.17 | -1.40 | -0.90 | 3.20 |\n", - "| High Yield | 7.19 | -0.22 | 0.20 | 8.87 |\n", - "| 90 Day Yield | 4.32 | 4.39 | 4.49 | 5.33 |\n", - "| 2 Year Yield | 4.24 | 4.10 | 4.15 | 4.25 |\n", - "| 10 Year Yield | 4.40 | 4.15 | 4.17 | 3.88 |\n", - "| 30 Year Yield | 4.60 | 4.34 | 4.36 | 4.03 |\n", - "\n", - "## Commodities & Currencies\n", - "\n", - "| | Total Return in USD (%) | Total Return in USD (%) | Total Return in USD (%) | Total Return in USD (%) |\n", - "|-----------------------|---------------------------|---------------------------|---------------------------|---------------------------|\n", - "| Commodities | Current | WTD | MTD | YTD |\n", - "| Bloomberg Commodity | 237.90 | 1.3 | 0.7 | 5.1 |\n", - "| WTI Crude $/Barrel †† | 71.29 | 6.1 | 4.8 | -0.5 |\n", - "| Gold Spot $/Ounce †† | 2648.23 | 0.6 | 0.2 | 28.4 |\n", - "\n", - "## Total Return in USD (%)\n", - "\n", - "| Currencies | Current | Prior Week End | Prior Month End | 2022 Year End |\n", - "|--------------|-----------|--------------------|---------------------|-------------------|\n", - "| EUR/USD | 1.05 | 1.06 | 1.06 | 1.1 |\n", - "| USD/JPY | 153.65 | 150 | 149.77 | 141.04 |\n", - "| USD/CNH | 7.28 | 7.28 | 7.25 | 7.13 |\n", - "\n", - "## S&P Sector Returns\n", - "\n", - "\n", - "\n", - "Sources: Bloomberg, Factset. Total Returns from the period of 12/9/2024 to 12/13/2024. †Bloomberg Barclays Indices. ††Spot price returns. All data as of the 12/13/2024 close. Data would differ if a different time period was displayed. Short-term performance shown to illustrate more recent trend. Past performance is no guarantee\n", - "\n", - "of future results.\n", - "\n", - "## Economic Forecasts (as of 12/13/2024)\n", - "\n", - "| | Q4 2024E | 2024E | Q1 2025E | Q2 2025E | Q3 2025E | Q4 2025E | 2025E |\n", - "|------------------------------------|------------|---------|------------|------------|------------|------------|---------|\n", - "| Real global GDP (% y/y annualized) | - | 3.1 | - | - | - | - | 3.2 |\n", - "| Real U.S. GDP (% q/q annualized) | 2.0 | 2.7 | 2.5 | 2.3 | 2.2 | 2.2 | 2.4 |\n", - "| CPI inflation (% y/y) | 2.7 | 2.9 | 2.3 | 2.3 | 2.7 | 2.5 | 2.5 |\n", - "| Core CPI inflation (% y/y) | 3.3 | 3.4 | 3.0 | 2.9 | 3.2 | 3.1 | 3 |\n", - "| Unemployment rate (%) | 4.2 | 4 | 4.3 | 4.3 | 4.4 | 4.4 | 4.3 |\n", - "| Fed funds rate, end period (%) | 4.38 | 4.38 | 4.13 | 3.88 | 3.88 | 3.88 | 3.88 |\n", - "\n", - "The forecasts in the table above are the base line view from BofA Global Research. The Global Wealth & Investment Management (GWIM) Investment Strategy Committee (ISC) may make adjustments to this view over the course of the year and can express upside/downside to these forecasts. Historical data is sourced from Bloomberg, FactSet, and\n", - "\n", - "Haver Analytics. There can be no assurance that the forecasts will be achieved. Economic or financial forecasts are inherently limited and should not be relied on as indicators of future investment performance.\n", - "\n", - "A = Actual. E/* = Estimate.\n", - "\n", - "Sources: BofA Global Research; GWIM ISC as of December 13, 2024.\n", - "\n", - "## Asset Class Weightings (as of 12/3/2024)\n", - "\n", - "| | CIO View | CIO View | CIO View | CIO View | CIO View |\n", - "|----------------------------------------|------------------------------|-------------|------------|------------|------------|\n", - "| Asset Class | Underweight | Underweight | Neutral | Overweight | Overweight |\n", - "| Global Equities | slight over weight green  |  |  | |  |\n", - "| U.S. Large Cap Growth |  |  | |  |  |\n", - "| U.S. Large Cap Value | Slight over weight green  |  |  | |  |\n", - "| U.S. Small Cap Growth | slight over weight green  |  |  | |  |\n", - "| U.S. Small Cap Value | slight over weight green  |  |  | |  |\n", - "| International Developed | Slight underweight orange  | |  |  |  |\n", - "| Emerging Markets |  |  | |  |  |\n", - "| Global Fixed Income | slight underweight orange  | |  |  |  |\n", - "| U.S. Governments | slight over weight green  |  |  | |  |\n", - "| U.S. Mortgages | Slight over weight green  |  |  | |  |\n", - "| U.S. Corporates | Slight underweight orange  | |  |  |  |\n", - "| International Fixed Income |  |  | |  |  |\n", - "| High Yield | Slight underweight orange  | |  |  |  |\n", - "| U.S. Investment-grade | Neutral yellow  |  | |  |  |\n", - "| Tax Exempt U.S. High Yield Tax Exempt | Slight underweight orange  | |  |  |  |\n", - "| Cash | | | | | |\n", - "\n", - "## CIO Equity Sector Views\n", - "\n", - "| | CIO View | CIO View | CIO View | CIO View | CIO View |\n", - "|-------------------------|------------------------------|-------------|------------|------------|------------|\n", - "| Sector | | Underweight | Neutral | | Overweight |\n", - "| Utilities | slight over weight green  |  |  | |  |\n", - "| Financials | slight over weight green  |  |  | |  |\n", - "| Healthcare | slight over weight green  |  |  | |  |\n", - "| Consumer Discretionary | Slight over weight green  |  |  | |  |\n", - "| Information Technology | Neutral yellow  |  | |  |  |\n", - "| Communication Services | Neutral yellow  |  | |  |  |\n", - "| Industrials | Neutral yellow  |  | |  |  |\n", - "| Real Estate | Neutral yellow  |  | |  |  |\n", - "| Energy | slight underweight orange  | |  |  |  |\n", - "| Materials | slight underweight orange  | |  |  |  |\n", - "| Consumer Staples | underweight red |  |  |  |  |\n", - "\n", - "CIO asset class views are relative to the CIO Strategic Asset Allocation (SAA) of a multi-asset portfolio. Source: Chief Investment Office as of December 3, 2024. All sector and asset allocation recommendations must be considered in the context of an individual investor's goals, time horizon, liquidity needs and risk tolerance. Not all recommendations will be in the best interest of all investors." - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "display(Markdown(forecast_result_docling))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "{numref}`docling` shows part of the parsed result from Docling." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```{figure} ../_static/input/docling.png\n", + "---\n", + "name: docling\n", + "alt: Docling's result\n", + "scale: 60%\n", + "align: center\n", + "---\n", + "Docling's parsed result\n", + "```\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -409,85 +308,9 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "Economic Forecasts (as of 12/13/2024)\n", - "\n", - "Real global GDP (% y/y annualized)\n", - "Real U.S. GDP (% q/q annualized)\n", - "CPI inflation (% y/y)\n", - "Core CPI inflation (% y/y)\n", - "Unemployment rate (%)\n", - "Fed funds rate, end period (%)\n", - "\n", - "Q4 2024E\n", - "-\n", - "2.0\n", - "2.7\n", - "3.3\n", - "4.2\n", - "4.38\n", - "\n", - "2024E\n", - "3.1\n", - "2.7\n", - "2.9\n", - "3.4\n", - "4.0\n", - "4.38\n", - "\n", - "Q1 2025E Q2 2025E Q3 2025E Q4 2025E\n", - "\n", - "-\n", - "2.5\n", - "2.3\n", - "3.0\n", - "4.3\n", - "4.13\n", - "\n", - "-\n", - "2.3\n", - "2.3\n", - "2.9\n", - "4.3\n", - "3.88\n", - "\n", - "-\n", - "2.2\n", - "2.7\n", - "3.2\n", - "4.4\n", - "3.88\n", - "\n", - "-\n", - "2.2\n", - "2.5\n", - "3.1\n", - "4.4\n", - "3.88\n", - "\n", - "2025E\n", - "3.2\n", - "2.4\n", - "2.5\n", - "3.0\n", - "4.3\n", - "3.88\n", - "\n", - "The forecasts in the table above are the base line view f" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from IPython.display import display, Markdown\n", "display(Markdown(forecast_result_md[:500]))" @@ -497,13 +320,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's focus on the economic forecasts. In particular, we are interested in the CIO's 2025E forecasts.\n", + "{numref}`markitdown` shows part of the parsed result from MarkItDown.\n", "\n", - "```{figure} ../data/input/2025.png\n", + "```{figure} ../_static/input/markitdown.png\n", + "---\n", + "name: markitdown\n", + "alt: MarkItDown's parsed result\n", + "scale: 60%\n", + "align: center\n", + "---\n", + "MarkItDown's parsed result\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's focus on the economic forecasts. In particular, we are interested in extracting the CIO's 2025E forecasts.\n", + "\n", + "```{figure} ../_static/input/2025.png\n", "---\n", "name: forecast2025\n", "alt: Forecast 2025\n", - "scale: 60%\n", + "scale: 45%\n", "align: center\n", "---\n", "Forecast 2025\n", @@ -530,7 +370,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) using the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze." + "We write a simple function to extract the economic forecasts from the document using an LLM model (with structured output) with the following prompt template, where `extract_prompt` is kind of data the user would like to extract and `doc` is the input document to analyze." ] }, { @@ -623,7 +463,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The response is a `EconForecast` object containing a list of `Forecast` objects. We can then convert the response to a pandas DataFrame for easier comparison." + "The response is an `EconForecast` object containing a list of `Forecast` objects, as defined in the pydantic model. We can then convert the response to a pandas DataFrame for easier comparison." ] }, { @@ -830,19 +670,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The results from both MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches successfully extracted the economic forecast data with equal precision. The formatting differences between the two methods did not impact their ability to capture and structure the underlying information at least in this particular case." + "The results from MarkItDown and Docling are identical and accurately match the true values from the document. This demonstrates that despite MarkItDown's output appearing less readable from a human perspective, both approaches enabled the LLM to successfully extract the economic forecast data with equal accuracy, in this particular case." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view is represented in a spectrum from \"Underweight\", passing through \"Neutral\" to \"Overweight\". And the actual view is marked by some colored dots. Let's see if we can extract the information from the document.\n", - "```{figure} ../data/input/asset_class.png\n", + "Now, let's focus on the asset class weightings. We will extract the asset class weightings from the document and compare the results from MarkItDown and Docling. The information now is presented in a quite different structure. The CIO view information is represented in a spectrum from starting with \"Underweight\", passing through \"Neutral\" and reaching \"Overweight\". The actual view is marked by some colored dots in the chart. Let's see if we can extract this information from the document.\n", + "```{figure} ../_static/input/asset_class.png\n", "---\n", "name: asset_class\n", "alt: Asset Class Weightings\n", - "scale: 60%\n", + "scale: 50%\n", "align: center\n", "---\n", "Asset Class Weightings\n", @@ -884,7 +724,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document." + "Now we construct a DataFrame to compare the results from MarkItDown and Docling with an added \"true_value\" column containing the true values from the document, which we extracted manually from the chart." ] }, { @@ -1091,7 +931,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output." + "Docling performs significantly better at 93.33% accuracy missing only one value. MarkItDown achieves 53.33% accuracy, struggling with nuanced asset class weightings. In this case, Docling's structured parsed output did help the LLM to extract the information more accurately compared to MarkItDown's unstructured output. Hence, in this case, the strategy used to parse the data did impact the LLM's ability to extract the information. A more robust analysis would run data extraction on a large sample data a number of repeated runs to estimate error rates." ] }, { @@ -1100,8 +940,8 @@ "source": [ "What if we want to systematically extract all tables from the document? We can use Docling to do that by simply accessing the `tables` attribute of the `DocumentConverter` object.\n", "\n", - "We observe that Docling extracted 7 tables from the document. Exporting tables from top down left to right in order of appearance.\n", - "We can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts. We also display the last table, which contains CIO Equity Sector Views.\n" + "By doing that, we observe that Docling extracted 7 tables from the document. Exporting tables from top down and left to right in order of appearance in the document.\n", + "Below, we can see the first table successfully extracted for Equities forecasts, the second one for Fixed Income forecasts as well as the last table, which contains CIO Equity Sector Views.\n" ] }, { @@ -1663,7 +1503,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model." + "Coming back to MarkItDown, one interesting feature to explore is the ability to extract information from images by passing an image capable LLM model to its constructor." ] }, { @@ -1688,13 +1528,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here's the description we obtain from the image of our input document. Overall, the description is somewhat accurate but contains a few inaccuracies including:\n", - "\n", - "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n", - "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n", - "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n", - "\n", - "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case.\n" + "Here's the description we obtain from the image of our input document." ] }, { @@ -1742,15 +1576,30 @@ "display(Markdown(result.text_content))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "Overall, the description is somewhat accurate but contains a few inaccuracies including:\n", + "\n", + "- For the sector weightings, the description states there are \"underweight positions in U.S. Small Cap Growth\" but looking at the Asset Class Weightings chart, U.S. Small Cap Growth actually shows an overweight position (green circle).\n", + "- The description mentions \"overweight positions in certain sectors such as Utilities and Financials\" but looking at the CIO Equity Sector Views, both these sectors show neutral positions, not overweight positions.\n", + "- For fixed income, the description cites a \"10-Year (4.03%)\" yield, but the image shows the 30-Year Yield at 4.03%, while the 10-Year Yield is actually 4.40%.\n", + "\n", + "Arguably, the description's inaccuracies could be a consequence of the underlying LLM model's inability to process the image. Further research is needed to determine if this is the case." + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Retrieval-Augmented Generation\n", "\n", - "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks.\n", + "RAG is a technique that allows LLMs to retrieve information from a knowledge base to answer questions. It is a popular technique for building LLM applications that require knowledge-intensive tasks {cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`.\n", "\n", - "{cite}`lewis2021retrievalaugmentedgenerationknowledgeintensivenlp`" + "RAG utilizes a retrieval system to fetch external knowledge and augment the LLM. It has proved effective in mitigating hallucinations of LLMs {cite}`10.1145/3589334.3645481, ni-etal-2024-llms`." ] }, { @@ -1774,12 +1623,12 @@ "source": [ "### Case Study I: Content Chunking with Contextual Linking\n", "\n", - "Content chunking with contextual linking is a technique used to manage the `max_output_tokens` limitation by breaking down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n", + "Content chunking with contextual linking is a technique to break down long-form content into smaller, manageable chunks while keeping chunk-specific context. This approach tackles three problems:\n", "1. The LLM's inability to process long inputs to do context-size limits\n", "2. The LLM's inability to generate long-form content due to the `max_output_tokens` limitation.\n", "3. The LLM's inability to maintain coherence and context when generating responses per chunks\n", "\n", - "The following steps are followed to implement content chunking with contextual linking:\n", + "Here, we exemplify this technique by following these steps:\n", "1. **Chunking the Content**: The input content is split into smaller chunks. This allows the LLM to process each chunk individually, focusing on generating a complete and detailed response for that specific section of the input.\n", "\n", "2. **Maintaining Context**: Each chunk is linked with contextual information from the previous chunks. This helps in maintaining the flow and coherence of the content across multiple chunks.\n", @@ -2105,7 +1954,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 109, "metadata": {}, "outputs": [ { @@ -2131,72 +1980,10 @@ "\n", "**2.2 Operational Risks:**\n", "\n", - "Several operational risks could significantly impact Apple's performance:\n", - "\n", - "* **Employee Retention:** Competition for highly skilled employees, particularly in Silicon Valley, poses a significant risk. Failure to retain key personnel or maintain its distinctive culture could negatively affect innovation, product development, and overall operational efficiency. **Market-moving insight:** Any significant changes in employee turnover rates or negative press regarding Apple's workplace culture could negatively impact investor sentiment.\n", - "\n", - "* **Reseller Dependence:** Apple's reliance on carriers, wholesalers, and retailers for product distribution introduces risks related to their financial health, distribution decisions, and potential changes in financing or subsidy programs. **Market-moving insight:** Monitoring the financial performance of key resellers and any changes in their distribution strategies is crucial.\n", - "\n", - "* **Information Technology and Cybersecurity:** Apple's dependence on complex IT systems makes it vulnerable to system failures, network disruptions, and cybersecurity threats (including ransomware attacks). These events could disrupt operations, damage reputation, and impact sales. The Form 10-K highlights the company's proactive measures, but acknowledges that these may not be sufficient to prevent all incidents. **Market-moving insight:** Any major cybersecurity breach or significant service outage could trigger a negative market reaction.\n", - "\n", - "**2.3 Legal and Regulatory Risks:**\n", - "\n", - "Apple faces significant legal and regulatory challenges:\n", - "\n", - "* **Antitrust Litigation:** The ongoing antitrust lawsuits in the U.S. and investigations in Europe concerning App Store practices pose a substantial risk. Adverse outcomes could result in significant fines, changes to business practices, and reputational damage. **Market-moving insight:** The progress and outcomes of these legal proceedings will be closely watched by the market. Any negative developments could significantly impact Apple's stock price.\n", - "\n", - "* **Digital Markets Act (DMA) Compliance:** Apple's efforts to comply with the DMA in the EU introduce uncertainty and potential costs. Non-compliance could lead to substantial fines. **Market-moving insight:** The Commission's ongoing investigations and any subsequent decisions will be closely monitored.\n", - "\n", - "* **Data Privacy and Protection:** Increasingly stringent data privacy regulations worldwide impose significant compliance costs and risks. Non-compliance could result in penalties and reputational harm. **Market-moving insight:** Any significant fines or negative publicity related to data privacy violations could negatively impact Apple's stock price.\n", - "\n", - "* **Other Legal Proceedings:** The Form 10-K notes that Apple is subject to various other legal proceedings, the outcomes of which are uncertain and could materially affect its financial condition.\n", - "\n", - "**2.4 Financial Risks:**\n", - "\n", - "Several financial risks could impact Apple's performance:\n", - "\n", - "* **Sales and Profit Margin Volatility:** Apple's quarterly net sales and profit margins are subject to fluctuations due to various factors, including pricing pressures, competition, product life cycles, supply chain issues, and macroeconomic conditions. **Market-moving insight:** Any significant deviation from expected sales or profit margins could trigger market reactions.\n", - "\n", - "* **Foreign Exchange Rate Fluctuations:** Apple's international operations expose it to risks associated with changes in the value of the U.S. dollar. Fluctuations in exchange rates can impact sales, earnings, and gross margins. **Market-moving insight:** Significant movements in major currency exchange rates relative to the USD should be monitored for their potential impact on Apple's financial results.\n", - "\n", - "* **Credit Risk and Investment Portfolio:** Apple's exposure to credit risk on trade receivables and fluctuations in the value of its investment portfolio could lead to losses. **Market-moving insight:** Any significant deterioration in the creditworthiness of key customers or a substantial decline in the value of Apple's investment portfolio could be viewed negatively by the market.\n", - "\n", - "* **Tax Risks:** Changes in tax rates, new tax legislation, and tax audits could materially affect Apple's financial performance.\n", "\n", " (...) \n", "\n", - " **4.10 Debt and Share Repurchases:**\n", - "\n", - "Note 9 details Apple's debt structure, including commercial paper and term debt. While the company has a strong credit rating, the significant amount of debt and the high weighted-average interest rate on commercial paper (5.00% in 2024) indicate potential interest rate risk. Note 10 highlights the substantial share repurchase program ($95 billion in 2024), which, while returning value to shareholders, could limit funds available for future investments or acquisitions. **Market-moving insight:** Investors will monitor the balance between debt levels, share repurchases, and investments in future growth.\n", - "\n", - "**4.11 Share-Based Compensation:**\n", - "\n", - "Note 11 shows a steady increase in share-based compensation expense, reflecting Apple's reliance on equity-based incentives to attract and retain talent. The significant unrecognized compensation cost related to outstanding RSUs ($19.4 billion in 2024) represents a future expense commitment. **Market-moving insight:** Changes in share-based compensation policies or unexpected increases in expense could impact future profitability.\n", - "\n", - "**4.12 Commitments and Supply Concentrations:**\n", - "\n", - "Note 12 reveals Apple's substantial unconditional purchase obligations, primarily for suppliers, licensed intellectual property, and content. These commitments represent significant future cash outflows and highlight the company's dependence on its supply chain. **Market-moving insight:** Any disruptions in the supply chain or changes in supplier relationships could negatively impact Apple's production and sales.\n", - "\n", - "\n", - "This detailed analysis reveals several key risk factors and market-moving insights beyond those identified in Part 3. Investors and analysts should carefully consider these factors when assessing Apple's future performance and valuation.\n", - "\n", - "**PART 5: Contingencies, Supply Chain, and Segment Analysis**\n", - "\n", - "This section analyzes additional information from Apple Inc.'s 2024 Form 10-K, focusing on contingencies, supply chain risks, and a deeper dive into segment performance.\n", - "\n", - "**5.1 Contingencies and Legal Proceedings:**\n", - "\n", - "The Form 10-K acknowledges that Apple is involved in various legal proceedings and claims. While management believes no material loss is reasonably possible beyond existing accruals, the inherent uncertainty of litigation remains a risk. Adverse outcomes in any of these cases could negatively impact Apple's financial condition and reputation. **Market-moving insight:** Any significant legal developments or settlements should be closely monitored for their potential market impact. Increased legal expenses or negative publicity could affect investor sentiment.\n", - "\n", - "**5.2 Supply Chain Concentration:**\n", - "\n", - "Apple's reliance on a concentrated network of outsourcing partners, primarily located in a few Asian countries, presents significant risks. The dependence on single or limited sources for certain custom components exposes Apple to supply chain disruptions, shortages, and price fluctuations. While Apple uses multiple sources for most components, the unique nature of some components used in new products creates vulnerability. Suppliers might prioritize common components over custom ones, impacting Apple's ability to produce its innovative products. **Market-moving insight:** Any significant supply chain disruptions, geopolitical instability in key manufacturing regions, or changes in supplier relationships could negatively impact Apple's production and sales, triggering a negative market reaction.\n", - "\n", - "**5.3 Detailed Segment Analysis:**\n", - "\n", - "Note 13 provides a detailed breakdown of Apple's segment performance. While the Americas and Europe showed growth, primarily driven by Services revenue, Greater China experienced a decline due to lower iPhone and iPad sales and currency headwinds. This highlights the regional economic and currency risks impacting Apple's revenue. The relatively flat year-over-year iPhone sales, despite growth in other product lines, warrants further investigation into market saturation and competitive pressures. The significant contribution of the Services segment to overall revenue and profitability underscores both its importance and the risk associated with its dependence on this segment.\n", - "\n", - "The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability. While increased R&D is generally positive, it reduces short-term profits. The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China. **Market-moving insight:** Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n", + " The reconciliation of segment operating income to consolidated operating income reveals that research and development (R&D) and other corporate expenses significantly impact overall profitability. While increased R&D is generally positive, it reduces short-term profits. The geographical breakdown of net sales and long-lived assets further emphasizes the concentration of Apple's business in the U.S. and China. **Market-moving insight:** Continued weakness in the Greater China market, sustained flat iPhone sales, or any significant changes in R&D spending should be closely monitored for their potential impact on Apple's financial performance and investor sentiment.\n", "\n", "\n", "**5.4 Auditor's Report and Internal Controls:**\n", @@ -2232,10 +2019,10 @@ " \n", "from IPython.display import Markdown\n", "\n", - "# Display first and last 25% of the report content\n", + "# Display first and last 10% of the report content\n", "report_lines = report_content.splitlines()\n", "total_lines = len(report_lines)\n", - "quarter_lines = total_lines // 4\n", + "quarter_lines = total_lines // 10\n", "\n", "top_portion = '\\n'.join(report_lines[:quarter_lines])\n", "bottom_portion = '\\n'.join(report_lines[-quarter_lines:])\n", @@ -2266,7 +2053,18 @@ "\n", "- **Depth of Analysis**: While the report covers a wide range of topics, the depth of analysis in certain sections may not be as comprehensive as a human expert's evaluation. Some nuances and contextual factors might be overlooked by the LLM. Splitting the report into multiple parts helps in mitigating this issue.\n", "\n", - "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n" + "- **Chunking Strategy**: The current approach splits the text into chunks based on size, which ensures that each chunk fits within the model's token limit. However, this method may disrupt the logical flow of the document, as sections of interest might be split across multiple chunks. An alternative approach could be \"structured\" chunking, where the text is divided based on meaningful sections or topics. This would preserve the coherence of each section, making it easier to follow and understand. Implementing structured chunking requires additional preprocessing to identify and segment the text appropriately, but it can significantly enhance the readability and logical flow of the generated report.\n", + "\n", + "Here, we implemented a simple strategy to improve the coherence in output generation given a multi-part chunked input. Many other strategies are possible. One related technique worth mentioning is Anthropic's Contextual Retrieval {cite}`anthropic2024contextualretrieval`. The approach, as shown in {numref}`anth_contextual`, employs an LLM itself to generate relevant context per chunk before passing these two pieces of information together to the LLM. This process was proposed in the context of RAGs to enhance its retrieval capabilities but can be applied more generally to improve output generation.\n", + "```{figure} ../_static/input/anth_contextual.png\n", + "---\n", + "name: anth_contextual\n", + "alt: Anthropic Contextual Linking\n", + "scale: 50%\n", + "align: center\n", + "---\n", + "Anthropic Contextual Linking {cite}`anthropic2024contextualretrieval`.\n", + "```" ] }, { @@ -2336,7 +2134,7 @@ "\n", "**Corpus-in-Context Prompting**\n", "\n", - "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor, which we used MarkitDown. The content is then added to the conversation input in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n", + "The `add()` method is key since it is used to add content to the client. It takes a list of URLs and extracts the content from each URL using a content extractor (using MarkitDown). The content is then added to the conversation input memory in a way that enables citations using the \"Corpus-in-Context\" (CIC) Prompting {cite}`lee2024longcontextlanguagemodelssubsume`.\n", "\n", "{numref}`cic` shows how CIC format is used to enable citations. It inserts a corpus into the prompt. Each candidate citable part (e.g., passage, chapter) in a corpus is assigned a unique identifier (ID) that can be referenced as needed for that task.\n", "\n", @@ -2376,7 +2174,7 @@ "```\n", "\n", "\n", - "Later, when the user sends a message to the client, the `msg()` method is used to generate a response while enabling citations. `self.content_generator` is an instance of our LLM model, which we will next.\n", + "Later, when the user sends a message to the client, the `msg()` method is used to generate a response while enabling citations. `self.content_generator` is an instance of our LLM model, which we will go through next.\n", "\n", "```python\n", " def msg(self, msg: str = \"\", add_citations: bool = False) -> str:\n", @@ -2397,9 +2195,7 @@ "\n", "LLM-based applications often involve repeatedly passing the same input tokens to a model, which can be inefficient and costly. Context caching addresses this by allowing you to cache input tokens after their first use and reference them in subsequent requests. This approach significantly reduces costs compared to repeatedly sending the same token corpus, especially at scale.\n", "\n", - "Context caching proves especially valuable when a large initial context needs to be referenced multiple times by smaller requests. By caching the context upfront, these applications can maintain high performance while optimizing token usage and associated costs.\n", - "\n", - "In our application, the user might pass a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` - containing the entire knowledge base and any additional user added content.\n", + "In our application, the user might passes a large knowledge base to the client that can be referenced multiple times by smaller user requests. Our `Client` class is composed of a `LLMBackend` class that takes the `input_memory` containing the entire knowledge base and any additional user added content.\n", "```python\n", "self.llm = LLMBackend(input=self.input_memory)\n", "```\n", @@ -2425,7 +2221,7 @@ "\n", "Coming back to our `Client` class, we implement the `quiz()` method to generate a quiz based on the full input memory, i.e. the initial knowledge base and any additional user added content.\n", "\n", - "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke the `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n", + "The `quiz()` method returns a `Quiz` instance which behind the scenes caches input tokens. The user later can invoke its `generate()` method to generate a quiz passing the user instructions in `msg` parameter, as we will see later.\n", "\n", "```python\n", " def quiz(self, add_citations: bool = True, num_questions: int = 10) -> str:\n", @@ -2449,7 +2245,8 @@ "> - Answers should be provided at the end of the quiz.\n", "> - Questions should have broad coverage of the input including multiple Input IDs.\n", "> - Level of difficulty is advanced/hard.\n", - "> - {citations}\n", + "> - {{citations}}\n", + ">\n", "> STRUCTURE:\n", "> - Sequence of questions and alternatives.\n", "> - At the end provide the correct answers.\n", @@ -2497,7 +2294,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We will import our module as `genai_duo` and initialize the `Client` class with our knowledge base." + "We will import our module `gemini_duo` as `genai_duo` and initialize the `Client` class with our knowledge base." ] }, { @@ -2562,7 +2359,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "{numref}`quiz` shows a sample sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n", + "{numref}`quiz` shows a sample quiz with citations. Marked in yellow are the citations which refer to the input IDs of the resources we added to the model.\n", "\n", "```{figure} ../_static/input/quiz.png\n", "---\n", @@ -2581,24 +2378,17 @@ "source": [ "#### Discussion\n", "\n", - "The experiment demonstrated the ability to build a knowledge base from multiple sources and generate quizzes with citations. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n", + "The experiment demonstrated the ability to build a knowledge base from multiple sources while leveraging prompt caching for efficiency and generate quizzes with citations for verifiability. The system successfully ingested content from Project Gutenberg texts, including historical documents like the Magna Carta, and used them to create interactive educational content.\n", "\n", "However, several limitations emerged during this process:\n", "\n", "1. Memory Management: The system currently loads all content into memory, which could become problematic with larger knowledge bases. A more scalable approach might involve chunking or streaming the content.\n", "\n", - "2. Context Window Constraints: With 38,470 tokens cached, we are approaching typical context window limits of many LLMs. This restricts how much knowledge can be referenced simultaneously during generation.\n", - "\n", - "3. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n", - "\n", - "4. Content Verification: The system does not currently verify the accuracy of generated quiz questions against the source material. This could lead to potential hallucinations or misinterpretations.\n", - "\n", - "5. Input Format Limitations: The current implementation works well with plain text but may struggle with more complex document formats or structured data sources.\n", - "\n", - "These limitations highlight opportunities for future improvements in knowledge management and citation systems when building LLM-powered educational tools.\n", + "2. Citation Quality: While the system provides citations, they lack specificity - pointing to entire documents rather than specific passages or page numbers. This limits the ability to fact-check or verify specific claims.\n", "\n", + "3. Content Verification: While citations are provided, the system is not guaranteed to provide factual information. This could lead to potential hallucinations or misinterpretations.\n", "\n", - "Citation Granularity: While citations are provided, currently they are given at the resource level rather than specific passages." + "While limitations are present in this simple example, the case study highlights that not always complex systems are needed. Alternative simple strategies should be preferred when possible, particularly if capable, long-context window models are available and fit within the application requirements.\n" ] }, { diff --git a/tamingllms/notebooks/local.ipynb b/tamingllms/notebooks/local.ipynb index fa1f01e..7a717ce 100644 --- a/tamingllms/notebooks/local.ipynb +++ b/tamingllms/notebooks/local.ipynb @@ -181,11 +181,11 @@ "Performance Comparison including proprietary models.\n", "```\n", "\n", - "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably as the most capable open source large language model available today. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n", + "Also from China, DeepSeek-V3 {cite}`deepseek2024v3` represents a major breakthrough in open source language models, emerging as arguably the most capable open source large language model available as of the end of 2024. With 671 billion parameters and 37 billion active MoE (Mixture of Experts) parameters, it achieves performance on par with leading proprietary models like Claude 3.5 Sonnet and GPT 4o as shown in {numref}`deep`. The model demonstrates impressive cost efficiency metrics (see {numref}`deep2`), processing input tokens at $0.27 per million and output tokens at $1.1 per million, while maintaining a generation speed of 60 tokens per second (3x faster than DeepSeek-V2).\n", "\n", - "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models.\n", + "What makes DeepSeek-V3 particularly remarkable is that these capabilities were achieved with a relatively modest training budget of just $5.5 million, used to train on 14.8 trillion tokens. This efficiency in training demonstrates the potential for open source models to compete with proprietary alternatives at a fraction of the cost. The model's release marks a significant milestone in the democratization of advanced AI capabilities, challenging the dominance of proprietary models within big tech. One should be cautious though as the model has not yet been battle-tested in the wild but this is an exciting development demonstrating the potential of open source models to compete with proprietary alternatives.\n", "\n", - "```{figure} ../_static/local/deep.png\n", + "```{figure} ../_static/local/deep.jpeg\n", "---\n", "name: deep\n", "alt: DeepSeek-V3\n", @@ -195,7 +195,7 @@ "DeepSeek-V3 Performance Comparison\n", "```\n", "\n", - "```{figure} ../_static/local/deep2.png\n", + "```{figure} ../_static/local/deep2.jpeg\n", "---\n", "name: deep2\n", "alt: DeepSeek-V3 Cost Benefit Analysis\n", diff --git a/tamingllms/references.bib b/tamingllms/references.bib index 1c7b988..bd754d5 100644 --- a/tamingllms/references.bib +++ b/tamingllms/references.bib @@ -782,7 +782,7 @@ @misc{lewis2021retrievalaugmentedgenerationknowledgeintensivenlp @misc{deepseek2024v3, title={DeepSeek-V3 Technical Report}, - author={DeepSeek AI}, + author={DeepSeek}, year={2024}, howpublished={Technical Report}, url={https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf} @@ -1186,9 +1186,108 @@ @techreport{ukgov2024airegulation24 url={https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper}, } + +@inproceedings{10.1145/3589334.3645481, +author = {Zhou, Yujia and Liu, Zheng and Jin, Jiajie and Nie, Jian-Yun and Dou, Zhicheng}, +title = {Metacognitive Retrieval-Augmented Large Language Models}, +year = {2024}, +isbn = {9798400701719}, +publisher = {Association for Computing Machinery}, +address = {New York, NY, USA}, +url = {https://doi.org/10.1145/3589334.3645481}, +doi = {10.1145/3589334.3645481}, +booktitle = {Proceedings of the ACM Web Conference 2024}, +pages = {1453-1463}, +numpages = {11}, +keywords = {llms, metacognition, retrieval-augmented generation}, +location = {Singapore, Singapore}, +series = {WWW '24} +} + +@misc{tan2024htmlraghtmlbetterplain, + title={HtmlRAG: HTML is Better Than Plain Text for Modeling Retrieved Knowledge in RAG Systems}, + author={Jiejun Tan and Zhicheng Dou and Wen Wang and Mang Wang and Weipeng Chen and Ji-Rong Wen}, + year={2024}, + eprint={2411.02959}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2411.02959}, +} + +@misc{anthropic2024contextualretrieval, + title={Introducing Contextual Retrieval}, + author={{Anthropic}}, + year={2024}, + month={09}, + url={https://www.anthropic.com/news/contextual-retrieval} +} + + +@article{zhou2024larger, +author = {Zhou, Lexin and Schellaert, Wout and Plumed, Fernando and Moros-Daval, Yael and Ferri, Cesar and Hernández-Orallo, Jose}, +year = {2024}, +month = {09}, +pages = {61-68}, +title = {Larger and more instructable language models become less reliable}, +volume = {634}, +journal = {Nature}, +doi = {10.1038/s41586-024-07930-y} +} + +@inproceedings{amayuelas-etal-2024-knowledge, + title = "Knowledge of Knowledge: Exploring Known-Unknowns Uncertainty with Large Language Models", + author = "Amayuelas, Alfonso and + Wong, Kyle and + Pan, Liangming and + Chen, Wenhu and + Wang, William Yang", + editor = "Ku, Lun-Wei and + Martins, Andre and + Srikumar, Vivek", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2024", + month = aug, + year = "2024", + address = "Bangkok, Thailand", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.findings-acl.383", + doi = "10.18653/v1/2024.findings-acl.383", + pages = "6416--6432", + +} + +@inproceedings{ +kotha2024understanding, +title={Understanding Catastrophic Forgetting in Language Models via Implicit Inference}, +author={Suhas Kotha and Jacob Mitchell Springer and Aditi Raghunathan}, +booktitle={The Twelfth International Conference on Learning Representations}, +year={2024}, +url={https://openreview.net/forum?id=VrHiF2hsrm} +} + + + +@inproceedings{ni-etal-2024-llms, + title = "When Do {LLM}s Need Retrieval Augmentation? Mitigating {LLM}s{'} Overconfidence Helps Retrieval Augmentation", + author = "Ni, Shiyu and + Bi, Keping and + Guo, Jiafeng and + Cheng, Xueqi", + editor = "Ku, Lun-Wei and + Martins, Andre and + Srikumar, Vivek", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2024", + month = aug, + year = "2024", + address = "Bangkok, Thailand", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.findings-acl.675", + doi = "10.18653/v1/2024.findings-acl.675", + pages = "11375--11388", +} + @misc{meta2024llamaguard, title={LlamaGuard: LLM-based Input-Output Safeguard for Human-AI Conversations}, - author={Meta AI}, + author={Meta-AI}, year={2024}, howpublished={Meta AI Research Publications}, url={https://ai.meta.com/research/publications/llama-guard-llm-based-input-output-safeguard-for-human-ai-conversations/},