Commit 46ed649c authored by Graham King, committed by GitHub
Browse files

fix: mistralrs use auto device map (#31)

Fixes a panic.
parent 3ba2b7e9
...@@ -2,16 +2,15 @@ ...@@ -2,16 +2,15 @@
`dynemo-run` is a tool for exploring the dynemo components. `dynemo-run` is a tool for exploring the dynemo components.
## Quickstart ## Setup
- Install Rust Libraries (Ubuntu):
- `cargo install --features mistralrs,cuda --git https://github.com/dynemo-ai/dynemo.git dynemo-run` ```
- `dynemo-run <GGUF or HF-repo-checkout>` apt install -y build-essential libhwloc-dev libudev-dev pkg-config libssl-dev protobuf-compiler python3-dev
```
## Install and start pre-requisites
Rust: Install Rust:
```bash ```
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
``` ```
......
...@@ -128,6 +128,8 @@ llama-cpp-2 = { version = "0.1.86", optional = true } ...@@ -128,6 +128,8 @@ llama-cpp-2 = { version = "0.1.86", optional = true }
tokenizers = { version = "0.21.0", default-features = false, features = [ tokenizers = { version = "0.21.0", default-features = false, features = [
"onig", "onig",
"esaxx_fast", "esaxx_fast",
# Waiting for release: https://github.com/huggingface/tokenizers/issues/1736
# "rustls-tls",
] } ] }
sentencepiece = { version = "0.11.2", optional = true } sentencepiece = { version = "0.11.2", optional = true }
......
...@@ -21,7 +21,7 @@ use async_trait::async_trait; ...@@ -21,7 +21,7 @@ use async_trait::async_trait;
use either::Either; use either::Either;
use indexmap::IndexMap; use indexmap::IndexMap;
use mistralrs::{ use mistralrs::{
Constraint, DefaultSchedulerMethod, Device, DeviceMapMetadata, DeviceMapSetting, AutoDeviceMapParams, Constraint, DefaultSchedulerMethod, Device, DeviceMapSetting,
GGUFLoaderBuilder, GGUFSpecificConfig, MemoryGpuConfig, MistralRs, MistralRsBuilder, GGUFLoaderBuilder, GGUFSpecificConfig, MemoryGpuConfig, MistralRs, MistralRsBuilder,
ModelDType, NormalLoaderBuilder, NormalRequest, NormalSpecificConfig, PagedAttentionConfig, ModelDType, NormalLoaderBuilder, NormalRequest, NormalSpecificConfig, PagedAttentionConfig,
Pipeline, Request, RequestMessage, ResponseOk, SamplingParams, SchedulerConfig, TokenSource, Pipeline, Request, RequestMessage, ResponseOk, SamplingParams, SchedulerConfig, TokenSource,
...@@ -41,6 +41,15 @@ use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine ...@@ -41,6 +41,15 @@ use crate::types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine
/// If the user does not provide max_tokens, limit prompt+output to this many tokens /// If the user does not provide max_tokens, limit prompt+output to this many tokens
const DEFAULT_MAX_TOKENS: i32 = 8192; const DEFAULT_MAX_TOKENS: i32 = 8192;
/// Passed as `max_seq_len` in `AutoDeviceMapParams::Text` when building the
/// automatic device map.
/// TODO: tune. Presumably we read it from model's config.json?
const MAX_SEQ_LEN: usize = 4096;
/// Passed as `max_batch_size` in `AutoDeviceMapParams::Text` when building the
/// automatic device map.
/// TODO: tune, maybe implement batching.
const MAX_BATCH_SIZE: usize = 2;
/// Passed as `max_num_seqs` in `SchedulerConfig::PagedAttentionMeta`.
/// TODO: tune
const PAGED_ATTENTION_MAX_NUM_SEQS: usize = 5;
pub async fn make_engine( pub async fn make_engine(
gguf_path: &Path, gguf_path: &Path,
) -> pipeline_error::Result<OpenAIChatCompletionsStreamingEngine> { ) -> pipeline_error::Result<OpenAIChatCompletionsStreamingEngine> {
...@@ -125,7 +134,10 @@ impl MistralRsEngine { ...@@ -125,7 +134,10 @@ impl MistralRsEngine {
&ModelDType::Auto, &ModelDType::Auto,
&best_device()?, &best_device()?,
false, false,
DeviceMapSetting::Map(DeviceMapMetadata::dummy()), DeviceMapSetting::Auto(AutoDeviceMapParams::Text {
max_seq_len: MAX_SEQ_LEN,
max_batch_size: MAX_BATCH_SIZE,
}),
None, None,
paged_attention_config, paged_attention_config,
)?; )?;
...@@ -138,7 +150,7 @@ impl MistralRsEngine { ...@@ -138,7 +150,7 @@ impl MistralRsEngine {
} }
}; };
SchedulerConfig::PagedAttentionMeta { SchedulerConfig::PagedAttentionMeta {
max_num_seqs: 5, max_num_seqs: PAGED_ATTENTION_MAX_NUM_SEQS,
config, config,
} }
} else { } else {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment