<!DOCTYPE html>
<!-- The doctype above must stay the first content in the file: any text before it
     switches browsers into quirks mode and is rendered as visible page text. -->
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">

  <title>Serving LLMs on an RTX4090 with Sequoia</title>
  <meta name="description" content="Serving LLMs on an RTX4090 with Sequoia">
  <!-- Keywords for the paper to be indexed by. -->
  <meta name="keywords" content="Speculative Decoding">

  <!-- Open Graph tags: used for link previews ("business card") on social media. -->
  <meta property="og:title" content="Sequoia">
  <meta property="og:description" content="Serving Llama2-70B on an RTX4090 with Sequoia">
  <meta property="og:url" content="https://dreaming-panda.github.io/Sequoia-Webpage/">
  <!-- Banner image as an absolute URL (the og:url above plus the image path), since
       link-preview crawlers generally expect a full URL. Optimal dimensions are 1200x630. -->
  <meta property="og:image" content="https://dreaming-panda.github.io/Sequoia-Webpage/static/images/proj_fig.png">
  <meta property="og:image:width" content="1200">
  <meta property="og:image:height" content="630">

  <!-- Twitter card tags. Banner image is the same file; optimal dimensions are 1200x600. -->
  <meta name="twitter:card" content="summary_large_image">
  <meta name="twitter:title" content="Sequoia">
  <meta name="twitter:description" content="Serving Llama2-70B on an RTX4090 with Sequoia">
  <meta name="twitter:image" content="https://dreaming-panda.github.io/Sequoia-Webpage/static/images/proj_fig.png">

  <link rel="icon" type="image/x-icon" href="static/images/colorful_sequoia.ico">

  <!-- Stylesheets: web fonts, Bulma and its plugins, icon fonts, then page-specific rules. -->
  <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro">
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts. NOTE(review): index.js presumably relies on jQuery and the Bulma plugins
       being loaded first, so the order and blocking behaviour are kept as-is. Confirm
       before adding defer/async. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>
<body>


  <!-- Title block: paper title, author list, affiliations, and code/paper links. -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title"><i>SEQUOIA</i>: Serving exact Llama2-70B on an RTX4090 <br>with half-second per token latency</h1>

            <!-- Paper authors. Superscript numbers refer to the affiliation list below;
                 the asterisk marks equal contribution. -->
            <div class="is-size-5 publication-authors">
              <span class="author-block">
                <a href="https://dreaming-panda.github.io/" target="_blank" rel="noopener">Zhuoming Chen</a><sup>*1</sup>,
              </span>
              <span class="author-block">
                <a href="https://avnermay.github.io/" target="_blank" rel="noopener">Avner May</a><sup>*2</sup>,
              </span>
              <span class="author-block">
                <a href="https://www.linkedin.com/in/ruslansv/" target="_blank" rel="noopener">Ruslan Svirschevski</a><sup>*3</sup>
              </span> <br>
              <span class="author-block">
                <a href="https://www.linkedin.com/in/yuhsunhuang/" target="_blank" rel="noopener">Yuhsun Huang</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a href="https://mryab.github.io/" target="_blank" rel="noopener">Max Ryabinin</a><sup>2</sup>,
              </span>
              <span class="author-block">
                <a href="https://www.cs.cmu.edu/~zhihaoj2/" target="_blank" rel="noopener">Zhihao Jia</a><sup>1</sup>,
              </span>
              <span class="author-block">
                <a href="https://www.andrew.cmu.edu/user/beidic/" target="_blank" rel="noopener">Beidi Chen</a><sup>1,4</sup>
              </span>
            </div>

            <!-- Affiliations. The class name "affliation" is kept verbatim in case the
                 stylesheet targets that spelling. -->
            <div class="is-size-5 publication-authors">
              <span class="affliation"><small><sup>1</sup>Carnegie Mellon University <sup>2</sup>Together AI <sup>3</sup>Yandex <sup>4</sup>Meta AI</small></span>
              <span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span>
            </div>

            <!-- External links; each opens in a new tab. Icons are decorative, so they are
                 hidden from assistive technology and the text label carries the name. -->
            <div class="column has-text-centered">
              <!-- GitHub link -->
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="https://github.com/Infini-AI-Lab/Sequoia/tree/main" target="_blank" rel="noopener">
                  <span class="icon">
                    <i class="fab fa-github" aria-hidden="true"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>

              <!-- arXiv abstract link -->
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="https://arxiv.org/abs/2402.12374" target="_blank" rel="noopener">
                  <span class="icon">
                    <i class="ai ai-arxiv" aria-hidden="true"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


<!-- Paper abstract -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Introduction</h2>
        <!-- Summary paragraph, one sentence per source line: what Sequoia is, then the
             headline time-between-tokens numbers for the RTX-4090 and the 2080Ti. -->
        <div class="content has-text-justified">
          <p>
            We introduce <i>Sequoia</i>, a scalable, robust and hardware-aware speculative decoding framework that enables serving LLMs (70B, 33B...) with a reasonable latency on consumer GPUs without any approximation (using <b>16bit</b> precision and maintaining the original output distribution).
            Addressing the problems of robustness and scalability of previous works on speculative decoding, we show below that <i>Sequoia</i>, with a large speculation budget, can serve a <b>Llama2-70B</b> on a single <b>RTX-4090</b> with an average time between tokens (TBT) as low as <b>0.57s</b>, which is <b>8X</b> faster than a highly optimized offloading serving system, <b>9X</b> faster than DeepSpeed-Zero Offloading.
            On a single <b>2080Ti</b> GPU (only 11GB memory), <b>Vicuna-33B</b> can be served with a TBT of <b>0.87s</b>.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End paper abstract -->
  
<!-- Solutions -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Serving Solutions by <i>Sequoia</i></h2>
        <div class="content has-text-justified">
          <!-- Results table: one row per GPU / target model / draft model combination.
               TBT is the time between tokens with Sequoia; Baseline is the comparison
               time. The first cell of each row is its row header. -->
          <table>
            <thead>
              <tr>
                <th scope="col">GPU</th>
                <th scope="col">Bandwidth(GB/s)</th>
                <th scope="col">Target Model</th>
                <th scope="col">Draft Model</th>
                <th scope="col">TBT(s)</th>
                <th scope="col">Baseline(s)</th>
              </tr>
            </thead>
            <tbody>
              <!-- A5000 results, currently disabled:
              <tr>
                <th scope="row">A5000</th>
                <td>31.5</td>
                <td>Llama2-70B</td>
                <td><a style="color: skyblue" href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf">Llama2-7B</a></td>
                <td>0.58</td>
                <td>4.59</td>
              </tr>
              <tr>
                <th scope="row">A5000</th>
                <td>31.5</td>
                <td>InternLM-20B</td>
                <td><a style="color: skyblue" href="https://huggingface.co/chargoddard/internlm2-7b-llama">InternLM-7B</a></td>
                <td>0.19</td>
                <td>0.82</td>
              </tr>
              -->
              <tr>
                <th scope="row">4090</th>
                <td>31.5</td>
                <td>Llama2-70B</td>
                <td><a style="color: skyblue" href="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf">Llama2-7B</a></td>
                <td>0.57</td>
                <td>4.54</td>
              </tr>
              <tr>
                <th scope="row">4090</th>
                <td>31.5</td>
                <td>Vicuna-33B</td>
                <td><a style="color: skyblue" href="https://huggingface.co/Jiayi-Pan/Tiny-Vicuna-1B">TinyVicuna-1B</a></td>
                <td>0.35</td>
                <td>1.78</td>
              </tr>
              <tr>
                <th scope="row">4090</th>
                <td>31.5</td>
                <td>Llama2-22B</td>
                <!-- Full Hugging Face URL: a bare "TinyLlama/..." path would resolve
                     relative to this page and lead nowhere. -->
                <td><a style="color: skyblue" href="https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0">TinyLlama-1.1B</a></td>
                <td>0.17</td>
                <td>0.95</td>
              </tr>
              <tr>
                <th scope="row">4090</th>
                <td>31.5</td>
                <td>InternLM-20B</td>
                <td><a style="color: skyblue" href="https://huggingface.co/chargoddard/internlm2-7b-llama">InternLM-7B</a></td>
                <td>0.17</td>
                <td>0.77</td>
              </tr>
              <tr>
                <th scope="row">4090</th>
                <td>31.5</td>
                <td>Llama2-13B</td>
                <td>TinyLlama-1.1B</td>
                <td>0.09</td>
                <td>0.27</td>
              </tr>
              <tr>
                <th scope="row">2080Ti</th>
                <td>15.8</td>
                <td>Vicuna-33B</td>
                <td>TinyVicuna-1B</td>
                <td>0.87</td>
                <td>4.81</td>
              </tr>
              <tr>
                <th scope="row">2080Ti</th>
                <td>15.8</td>
                <td>Llama2-22B</td>
                <td>TinyLlama-1.1B</td>
                <td>0.53</td>
                <td>3.04</td>
              </tr>
              <tr>
                <th scope="row">2080Ti</th>
                <td>15.8</td>
                <td>Llama2-13B</td>
                <td>TinyLlama-1.1B</td>
                <td>0.34</td>
                <td>1.53</td>
              </tr>
              <tr>
                <th scope="row">A100-SXM4</th>
                <td>31.5</td>
                <td>Llama3-70B-Instruct</td>
                <td>Llama3-8B-Instruct</td>
                <td>0.38</td>
                <td>2.64</td>
              </tr>
              <!-- NOTE(review): the two A100-SXM4* rows below report identical TBT and
                   baseline values for T=0.6 and T=0. Confirm this is intended and not a
                   copy-paste leftover. -->
              <tr>
                <th scope="row">A100-SXM4*</th>
                <td>31.5</td>
                <td>Llama3-70B-Instruct (T=0.6)</td>
                <td>Llama3-8B-Instruct</td>
                <td>0.47</td>
                <td>5.30</td>
              </tr>
              <tr>
                <th scope="row">A100-SXM4*</th>
                <td>31.5</td>
                <td>Llama3-70B-Instruct (T=0)</td>
                <td>Llama3-8B-Instruct</td>
                <td>0.47</td>
                <td>5.30</td>
              </tr>
            </tbody>
          </table>

          <!-- Evaluation setup; also explains the "*" marker used in the table. -->
          <p>
            <i>Sequoia</i> can speed up LLM inference for a variety of model sizes and types of hardware. We evaluate <i>Sequoia</i> with LLMs of various sizes (including
            <a style="color: skyblue" href="https://huggingface.co/meta-llama/Llama-2-70b-chat-hf">Llama2-70B-chat</a>, <a style="color: skyblue" href="https://huggingface.co/lmsys/vicuna-33b-v1.3">Vicuna-33B</a>,
            <a style="color: skyblue" href="https://huggingface.co/nkpz/llama2-22b-daydreamer-v3">Llama2-22B</a>, <a style="color: skyblue" href="https://huggingface.co/chargoddard/internlm2-20b-llama">InternLM-20B</a> and
            <a style="color: skyblue" href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf">Llama2-13B-chat</a>), on 4090 and 2080Ti, prompted by <a style="color: skyblue" href="https://github.com/lm-sys/FastChat/blob/main/fastchat/llm_judge/data/mt_bench/question.jsonl">MT-Bench</a> with temperature=0.6.
            The hardware platforms have different GPUs, CPU RAMs and CPU-GPU bandwidth. The evaluation results are listed above. * indicates layer-wise offloading implementations.
          </p>

          <p>
            Here we show a demo for Llama2-70B inference on a single RTX-4090 (with and without <i>Sequoia</i>. Video plays at 4X speed).
          </p>
          <!-- Demo video. It is muted so that autoplay is permitted by browsers. -->
          <div class="item item-video1">
            <video id="video1" autoplay controls muted height="100%">
              <source src="static/videos/Spec_Final4xT.mp4" type="video/mp4">
            </video>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End Solutions -->
  
<!-- Why -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Why <i>Sequoia</i></h2>
        <div class="content has-text-justified">
          <p>
            Benefiting from two key advantages, <i>Sequoia</i> significantly accelerates LLM serving with offloading. Firstly, <i>Sequoia</i> is more <b>scalable</b> with a large speculation budget. For a given draft/target model pair, <i>Sequoia</i> leverages a dynamic
            programming algorithm to search for the optimal tree structure, which enables a much faster growth in terms of accepted tokens with a certain budget (i.e. the size of the speculation tree). Secondly, thanks to its sampling-without-replacement algorithm, <i>Sequoia</i>
            is <b>robust</b> in terms of generating temperatures, compared to top-k sampling and sampling with replacement.
            Apart from offloading, <i>Sequoia</i> provides a hardware-aware solution to adjust the size and depth of speculation trees to adapt to different hardware platforms. <i>Sequoia</i> can also speed up LLM inference on data-center GPUs like A100 and L40, which is discussed in detail in our <a style="color: skyblue" href="https://arxiv.org/abs/2402.12374" target="_blank" rel="noopener">paper</a>.
          </p>
        </div>

        <!-- Figure 1: scalability (left panel) and robustness (right panel), with caption. -->
        <div class="figure">
          <img
            src="static/images/rssmerge.jpg"
            alt="Robustness and Scalability"
            width="800"
            height="400">
          <p><b>Left (Scalability):</b> Handcrafted tree structures do not scale well with a large speculation budget.<br><b>Right (Robustness):</b> The total acceptance rate of 5 speculation tokens. Sampling with replacement (SpecTr) fails when temperature is low and Top-k sampling fails with high temperature. <i>Sequoia</i>, leveraging sampling without replacement, attains the highest acceptance rate.
          <br></p>

          <!-- Figure 2: example tree structures. This wrapper is nested inside the first
               figure wrapper; the nesting is kept so the existing layout does not shift. -->
          <div class="figure">
            <p><br>Below we show two examples of tree structures in <i>Sequoia</i>. The left one has 64 nodes which is suitable for on-chip inference and the right one has 768 nodes, suitable for offloading settings.
            We append more budget to nodes in previous layers with a higher probability to get accepted.</p>
            <img
              src="static/images/treemerge.jpg"
              alt="Tree Shape"
              width="800"
              height="400">
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End Why -->
  
<!-- Discussion -->
<section class="section hero is-light">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Conclusion and Future Work</h2>
        <div class="content has-text-justified">
          <p>
            Leveraging a large speculation budget, everyone can use an RTX 4090 or another consumer (low-cost) GPU, e.g., AMD RX7900, with <i>Sequoia</i> to host very strong LLMs like a 70B model without approximation, boosting the applications of AI generated content.
            In addition, we believe <i>Sequoia</i> will perform particularly well on future hardware, because its performance scales well with the compute/bandwidth ratio of the hardware, which has been increasing over time (e.g., V100, A100 and H100).
            Moreover, <i>Sequoia</i>, as a speculative decoding framework which mitigates the gap in the memory hierarchy, adapts to any draft/target pairs and any AI accelerators. We will stay tuned with the hardware community.
          </p>
        </div>
        <!-- Project icon. The alt text is plain text: markup inside an attribute value is
             not rendered and would be read out literally by assistive technology. -->
        <div class="figure">
          <img
            src="static/images/colorful_sequoia.ico"
            alt="Sequoia icon"
            width="200"
            height="200">
        </div>
      </div>
    </div>
  </div>
</section>
<!-- End Discussion -->
  
<!--BibTex citation -->
  <section class="section" id="BibTeX">
    <!-- Citation entry for the paper, shown as preformatted text for copy-and-paste. -->
    <div class="container is-max-desktop content">
      <h2 class="title">BibTeX</h2>
      <!-- Whitespace inside <pre> is rendered verbatim, so the entry below is deliberately
           not indented to match the surrounding markup. -->
      <pre><code>@article{chen2024sequoia,
  title={Sequoia: Scalable, Robust, and Hardware-aware Speculative Decoding},
  author={Chen, Zhuoming and May, Avner and Svirschevski, Ruslan and Huang, Yuhsun and Ryabinin, Max and Jia, Zhihao and Chen, Beidi},
  journal={arXiv preprint arXiv:2402.12374},
  year={2024}
}</code></pre>
    </div>
</section>
<!--End BibTex citation -->


  <!-- Page footer: template attribution and content license. -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">
            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template" target="_blank" rel="noopener">Academic Project Page Template</a> which was adopted from the <a href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer. <br> This website is licensed under a <a rel="license noopener" href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
              Commons Attribution-ShareAlike 4.0 International License</a>. The icons are created by GPT4.
            </p>
          </div>
        </div>
      </div>
    </div>
  </footer>

<!-- Statcounter tracking code -->
  
<!-- You can add a tracker to track page visits by creating an account at statcounter.com -->

    <!-- End of Statcounter Code -->

  </body>
  </html>