fix: mistralrs use auto device map (#31)

Fixes a panic.

fix: mistralrs use auto device map (#31)
Fixes a panic.
46ed649c · Graham King · GitHub · 3ba2b7e9 · 46ed649c · 46ed649c
Commit 46ed649c authored Mar 05, 2025 by Graham King Committed by GitHub Mar 05, 2025
Showing with 24 additions and 11 deletions

launch/dynemo-run/README.md launch/dynemo-run/README.md +7 -8

lib/llm/Cargo.toml lib/llm/Cargo.toml +2 -0

lib/llm/src/engines/mistralrs.rs lib/llm/src/engines/mistralrs.rs +15 -3

No files found.
--- a/launch/dynemo-run/README.md
+++ b/launch/dynemo-run/README.md
@@ -2,16 +2,15 @@
 `dynemo-run` is a tool for exploring the dynemo components.
-## Quickstart
+## Setup
- Install Rust
+Libraries (Ubuntu):
- `cargo install --features mistralrs,cuda --git https://github.com/dynemo-ai/dynemo.git dynemo-run`
+```
- `dynemo-run <GGUF or HF-repo-checkout>`
+apt install -y build-essential libhwloc-dev libudev-dev pkg-config libssl-dev protobuf-compiler python3-dev
+```
-## Install and start pre-requisites
-Rust:
+Install Rust:
-```bash
+```
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```

--- a/lib/llm/Cargo.toml
+++ b/lib/llm/Cargo.toml
@@ -128,6 +128,8 @@ llama-cpp-2 = { version = "0.1.86", optional = true }
 tokenizers = { version = "0.21.0", default-features = false, features = [
  "onig",
  "esaxx_fast",
+  # Waiting for release: https://github.com/huggingface/tokenizers/issues/1736
+  # "rustls-tls",
 ] }
 sentencepiece = { version = "0.11.2", optional = true }

--- a/lib/llm/src/engines/mistralrs.rs
+++ b/lib/llm/src/engines/mistralrs.rs
@@ -21,7 +21,7 @@ use async_trait::async_trait;
 use either::Either;
 use indexmap::IndexMap;
 use mistralrs::{
-    Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, DeviceMapSetting,
+    AutoDeviceMapParams, Constraint, DefaultSchedulerMethod, Device, DeviceMapSetting,
    GGUFLoaderBuilder, GGUFSpecificConfig, MemoryGpuConfig, MistralRs, MistralRsBuilder,
    ModelDType, NormalLoaderBuilder, NormalRequest, NormalSpecificConfig, PagedAttentionConfig,
    Pipeline, Request, RequestMessage, ResponseOk, SamplingParams, SchedulerConfig, TokenSource,
@@ -41,6 +41,15 @@ use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine
 /// If the user does not provide `max_tokens`, limit prompt + output to this many tokens.
 const DEFAULT_MAX_TOKENS: i32 = 8192;
+/// TODO: tune. Presumably this should be read from the model's config.json?
+const MAX_SEQ_LEN: usize = 4096;
+/// TODO: tune, maybe implement batching.
+const MAX_BATCH_SIZE: usize = 2;
+/// TODO: tune
+const PAGED_ATTENTION_MAX_NUM_SEQS: usize = 5;
 pub async fn make_engine(
    gguf_path: &Path,
 ) -> pipeline_error::Result<OpenAIChatCompletionsStreamingEngine> {
@@ -125,7 +134,10 @@ impl MistralRsEngine {
            &ModelDType::Auto,
            &best_device()?,
            false,
-            DeviceMapSetting::Map(DeviceMapMetadata::dummy()),
+            DeviceMapSetting::Auto(AutoDeviceMapParams::Text {
+                max_seq_len: MAX_SEQ_LEN,
+                max_batch_size: MAX_BATCH_SIZE,
+            }),
            None,
            paged_attention_config,
        )?;
@@ -138,7 +150,7 @@ impl MistralRsEngine {
                }
            };
            SchedulerConfig::PagedAttentionMeta {
-                max_num_seqs: 5,
+                max_num_seqs: PAGED_ATTENTION_MAX_NUM_SEQS,
                config,
            }
        } else {