feat(speculative-sampling): allow to specify a draft model in the model config (#1052)

**Description**

This PR fixes #1013.

It adds `draft_model` and `n_draft` to the model YAML config in order to load models with speculative sampling. This should also be compatible with grammars.

Example:

```yaml
backend: llama
context_size: 1024
name: my-model-name
parameters:
  model: foo-bar
n_draft: 16
draft_model: model-name
```

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
mudler · Sep 14, 2023 · 8ccf5b2 · 8ccf5b2
1 parent 247d85b
commit 8ccf5b2
Show file tree

Hide file tree

Showing 12 changed files with 485 additions and 427 deletions.
diff --git a/api/backend/options.go b/api/backend/options.go
@@ -42,6 +42,7 @@ func gRPCModelOpts(c config.Config) *pb.ModelOptions {
 		Seed:          int32(c.Seed),
 		NBatch:        int32(b),
 		NoMulMatQ:     c.NoMulMatQ,
+		DraftModel:    c.DraftModel,
 		AudioPath:     c.VallE.AudioPath,
 		LoraAdapter:   c.LoraAdapter,
 		LoraBase:      c.LoraBase,
@@ -79,6 +80,7 @@ func gRPCPredictOpts(c config.Config, modelPath string) *pb.PredictOptions {
 	return &pb.PredictOptions{
 		Temperature:         float32(c.Temperature),
 		TopP:                float32(c.TopP),
+		NDraft:              c.NDraft,
 		TopK:                int32(c.TopK),
 		Tokens:              int32(c.Maxtokens),
 		Threads:             int32(c.Threads),

diff --git a/api/config/config.go b/api/config/config.go
@@ -101,6 +101,8 @@ type LLMConfig struct {
 	LoraAdapter     string   `yaml:"lora_adapter"`
 	LoraBase        string   `yaml:"lora_base"`
 	NoMulMatQ       bool     `yaml:"no_mulmatq"`
+	DraftModel      string   `yaml:"draft_model"`
+	NDraft          int32    `yaml:"n_draft"`
 }
 
 type AutoGPTQ struct {

diff --git a/extra/grpc/autogptq/backend_pb2.py b/extra/grpc/autogptq/backend_pb2.py
diff --git a/extra/grpc/bark/backend_pb2.py b/extra/grpc/bark/backend_pb2.py
diff --git a/extra/grpc/diffusers/backend_pb2.py b/extra/grpc/diffusers/backend_pb2.py
diff --git a/extra/grpc/exllama/backend_pb2.py b/extra/grpc/exllama/backend_pb2.py
diff --git a/extra/grpc/huggingface/backend_pb2.py b/extra/grpc/huggingface/backend_pb2.py
diff --git a/extra/grpc/vall-e-x/backend_pb2.py b/extra/grpc/vall-e-x/backend_pb2.py
diff --git a/extra/grpc/vllm/backend_pb2.py b/extra/grpc/vllm/backend_pb2.py
diff --git a/pkg/backend/llm/llama/llama.go b/pkg/backend/llm/llama/llama.go
@@ -14,7 +14,8 @@ import (
 type LLM struct {
 	base.SingleThread
 
-	llama *llama.LLama
+	llama      *llama.LLama
+	draftModel *llama.LLama
 }
 
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
@@ -78,7 +79,27 @@ func (llm *LLM) Load(opts *pb.ModelOptions) error {
 		llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
 	}
 
+	if opts.DraftModel != "" {
+		// https://github.com/ggerganov/llama.cpp/blob/71ca2fad7d6c0ef95ef9944fb3a1a843e481f314/examples/speculative/speculative.cpp#L40
+		llamaOpts = append(llamaOpts, llama.SetPerplexity(true))
+	}
+
 	model, err := llama.New(opts.ModelFile, llamaOpts...)
+
+	if opts.DraftModel != "" {
+		// opts.DraftModel is relative to opts.ModelFile, so we need to get the basepath of opts.ModelFile
+		if !filepath.IsAbs(opts.DraftModel) {
+			dir := filepath.Dir(opts.ModelFile)
+			opts.DraftModel = filepath.Join(dir, opts.DraftModel)
+		}
+
+		draftModel, err := llama.New(opts.DraftModel, llamaOpts...)
+		if err != nil {
+			return err
+		}
+		llm.draftModel = draftModel
+	}
+
 	llm.llama = model
 
 	return err
@@ -162,6 +183,9 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 		predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
 	}
 
+	if opts.NDraft != 0 {
+		predictOptions = append(predictOptions, llama.SetNDraft(int(opts.NDraft)))
+	}
 	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
 
 	predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
@@ -175,6 +199,9 @@ func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 }
 
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
+	if llm.draftModel != nil {
+		return llm.llama.SpeculativeSampling(llm.draftModel, opts.Prompt, buildPredictOptions(opts)...)
+	}
 	return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
 }
 
@@ -187,7 +214,13 @@ func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) erro
 	}))
 
 	go func() {
-		_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
+		var err error
+		if llm.draftModel != nil {
+			_, err = llm.llama.SpeculativeSampling(llm.draftModel, opts.Prompt, buildPredictOptions(opts)...)
+		} else {
+			_, err = llm.llama.Predict(opts.Prompt, predictOptions...)
+		}
+
 		if err != nil {
 			fmt.Println("err: ", err)
 		}

diff --git a/pkg/grpc/proto/backend.pb.go b/pkg/grpc/proto/backend.pb.go
diff --git a/pkg/grpc/proto/backend.proto b/pkg/grpc/proto/backend.proto
@@ -63,6 +63,7 @@ message PredictOptions {
   float RopeFreqScale = 38;
   float NegativePromptScale = 39;
   string NegativePrompt = 40;
+  int32 NDraft = 41;
 }
 
 // The response message containing the result
@@ -116,7 +117,8 @@ message ModelOptions {
   string LoraBase = 35;
   string LoraAdapter = 36;
   bool NoMulMatQ = 37;
-
+  string DraftModel = 39;
+
   string AudioPath = 38;
 }