Fixes and verbosity improvements for device mapping #332

Merged · 1 commit · May 19, 2024
1 change: 0 additions & 1 deletion mistralrs-bench/src/main.rs
@@ -310,7 +310,6 @@ fn main() -> anyhow::Result<()> {
         candle_core::utils::with_f16c()
     );
     info!("Sampling method: penalties -> temperature -> topk -> topp -> multinomial");
-    info!("Loading model `{}` on {device:?}...", loader.get_id());
     if use_flash_attn {
         info!("Using flash attention.");
     }
10 changes: 7 additions & 3 deletions mistralrs-core/src/device_map.rs
@@ -41,11 +41,15 @@ impl DeviceMapMetadata {
             }));
         };
         // How many host (cpu) layers, defaulting to automatically filling the rest.
-        let n_host_layers = self.host_layers.unwrap_or(model_layers - n_device_layers);
+        // If n_device_layers > model_layers, n_host_layers = 0
+        let n_host_layers = self
+            .host_layers
+            .unwrap_or(model_layers.saturating_sub(n_device_layers));
         if n_device_layers + n_host_layers != model_layers {
-            candle_core::bail!("Expected the number of device ({n_device_layers}) and host layers ({n_host_layers}) to sum to the number of model hidden layers ({model_layers})");
+            candle_core::bail!("Expected the number of GPU ({n_device_layers}) and host layers ({n_host_layers}) to sum to the number of model hidden layers ({model_layers})");
         }
-        info!("Using {n_device_layers} layers on device and {n_host_layers} on host.");
+        info!("Model has {model_layers} repeating layers.");
+        info!("Using {n_device_layers} repeating layers on GPU and {n_host_layers} repeating layers on host.");
         let mut combined = vec![device.clone(); n_device_layers];
         // Always put the CPU layers at the end so that we reduce dtoh and htod copies
         combined.extend(vec![Device::Cpu; n_host_layers]);
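The substantive fix in this hunk is the switch from plain subtraction to saturating_sub: if a caller requests more device layers than the model has, model_layers - n_device_layers underflows (an overflow panic in debug builds) before the sum check can ever produce its error message. A minimal standalone sketch of the new behavior, with illustrative names and a plain String error rather than the crate's real signature:

    /// Sketch of the layer-split logic above; names and the error type are
    /// illustrative, not the crate's API.
    fn split_layers(
        model_layers: usize,
        n_device_layers: usize,
        host_layers: Option<usize>,
    ) -> Result<(usize, usize), String> {
        // saturating_sub clamps at zero, so an oversized device-layer request
        // reaches the explicit error below instead of panicking on underflow.
        let n_host_layers =
            host_layers.unwrap_or(model_layers.saturating_sub(n_device_layers));
        if n_device_layers + n_host_layers != model_layers {
            return Err(format!(
                "Expected the number of GPU ({n_device_layers}) and host layers \
                 ({n_host_layers}) to sum to the number of model hidden layers \
                 ({model_layers})"
            ));
        }
        Ok((n_device_layers, n_host_layers))
    }

    fn main() {
        // 32-layer model, 24 layers on GPU: the remaining 8 go to the host.
        assert_eq!(split_layers(32, 24, None), Ok((24, 8)));
        // 40 GPU layers requested for a 32-layer model: host count clamps to 0
        // and the mismatch is reported as an error rather than a panic.
        assert!(split_layers(32, 40, None).is_err());
    }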
1 change: 1 addition & 0 deletions mistralrs-core/src/pipeline/ggml.rs
@@ -228,6 +228,7 @@ impl Loader for GGMLLoader {
         if !mapper.is_dummy() {
             warn!("GGML models do not support device mapping. Device mapping will not work. Please consider using a GGUF model.");
         }
+        info!("Loading model `{}` on {device:?}...", self.get_id());

         let mut file = std::fs::File::open(paths.get_weight_filenames().first().unwrap())?;
         let model = ggml_file::Content::read(&mut file, device)
5 changes: 5 additions & 0 deletions mistralrs-core/src/pipeline/gguf.rs
@@ -314,6 +314,11 @@ impl Loader for GGUFLoader {
                 "You are trying to in-situ quantize a GGUF model. This will not do anything."
             );
         }
+        // Otherwise, the device mapper will print it
+        if mapper.is_dummy() {
+            info!("Loading model `{}` on {device:?}...", self.get_id());
+        }
+
         let mut file = std::fs::File::open(paths.get_weight_filenames().first().unwrap())?;
         let model = gguf_file::Content::read(&mut file)
             .map_err(|e| e.with_path(paths.get_weight_filenames().first().unwrap()))?;
4 changes: 4 additions & 0 deletions mistralrs-core/src/pipeline/normal.rs
@@ -211,6 +211,10 @@ impl Loader for NormalLoader {
         } else {
             DType::F32
         };
+        // Otherwise, the device mapper will print it
+        if mapper.is_dummy() {
+            info!("Loading model `{}` on {device:?}...", self.get_id());
+        }

         info!(
             "Model config: {:?}",
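Both gguf.rs and normal.rs gate the same log line on mapper.is_dummy(): when a real device map is active, device_map.rs now reports the repeating-layer total and the GPU/host split itself, so a separate "Loading model ... on {device}" line would be redundant and misleading about where layers actually end up. A small sketch of the pattern, using a simplified stand-in for the crate's mapper type:

    // `Mapper` is a stand-in for the crate's device-mapper metadata; only the
    // is_dummy() gate mirrors the real code.
    struct Mapper {
        device_layers: Option<usize>,
    }

    impl Mapper {
        // A dummy mapper means no device mapping was requested.
        fn is_dummy(&self) -> bool {
            self.device_layers.is_none()
        }
    }

    fn announce_load(mapper: &Mapper, model_id: &str, device: &str) {
        if mapper.is_dummy() {
            // No mapper output will follow, so the loader names the device.
            println!("Loading model `{model_id}` on {device}...");
        }
        // Otherwise the mapper prints the repeating-layer total and the
        // GPU/host split, which already says where the layers will live.
    }

    fn main() {
        let unmapped = Mapper { device_layers: None };
        let mapped = Mapper { device_layers: Some(24) };
        announce_load(&unmapped, "my-model", "Cuda(0)"); // prints the line
        announce_load(&mapped, "my-model", "Cuda(0)"); // silent; mapper logs instead
    }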
1 change: 0 additions & 1 deletion mistralrs-server/src/main.rs
@@ -90,7 +90,7 @@
     chat_template: Option<String>,

     /// Source of the token for authentication.
     /// Can be in the formats: "literal:<value>", "env:<value>", "path:<value>", "cache" to use a cached token or "none" to use no token.
     /// Defaults to using a cached token.
     #[arg(long, default_value_t = TokenSource::CacheToken, value_parser = parse_token_source)]
     token_source: TokenSource,

GitHub Actions / Docs — warning on line 93 in mistralrs-server/src/main.rs (reported three times): unclosed HTML tag `value`
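The three identical Docs warnings share one cause: rustdoc parses the bare <value> placeholders in the doc comment as unclosed HTML tags. A possible fix, sketched here as an assumption rather than taken from this PR, is to wrap the placeholders in backticks so rustdoc treats them as inline code:

    /// Source of the token for authentication.
    /// Can be in the formats: `literal:<value>`, `env:<value>`, `path:<value>`,
    /// `cache` to use a cached token, or `none` to use no token.
    /// Defaults to using a cached token.
    #[arg(long, default_value_t = TokenSource::CacheToken, value_parser = parse_token_source)]
    token_source: TokenSource,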
@@ -266,7 +266,6 @@
         candle_core::utils::with_f16c()
     );
     info!("Sampling method: penalties -> temperature -> topk -> topp -> multinomial");
-    info!("Loading model `{}` on {device:?}...", loader.get_id());
     if use_flash_attn {
         info!("Using flash attention.");
     }