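Support TeleChat-12B: add a HEADDIM == 160 case to the attention head-size switch macros and import SamplerOutput from the sampler module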

fa5b0b39 · zhuwenwen · e96edbbe · fa5b0b39 · fa5b0b39 · fa5b0b39
Commit fa5b0b39 authored Oct 25, 2024 by zhuwenwen
3 changed files
--- a/csrc/attention/static_switch.h
+++ b/csrc/attention/static_switch.h
@@ -48,6 +48,9 @@
    } else if (HEADDIM == 128) {           \
      constexpr static int HEAD_SIZE = 128; \
      return __VA_ARGS__();                \
+    } else if (HEADDIM == 160) {           \
+      constexpr static int HEAD_SIZE = 160; \
+      return __VA_ARGS__();                \
    } else if (HEADDIM == 192) {           \
      constexpr static int HEAD_SIZE = 192; \
      return __VA_ARGS__();                \

--- a/csrc/attention/static_switch_tc.h
+++ b/csrc/attention/static_switch_tc.h
@@ -40,6 +40,9 @@
    } else if (HEADDIM == 128) {           \
      constexpr static int HEAD_SIZE = 128; \
      return __VA_ARGS__();                \
+    } else if (HEADDIM == 160) {           \
+      constexpr static int HEAD_SIZE = 160; \
+      return __VA_ARGS__();                \
    } else if (HEADDIM == 256) {           \
      constexpr static int HEAD_SIZE = 256; \
      return __VA_ARGS__();                \

--- a/vllm/model_executor/models/telechat_12B.py
+++ b/vllm/model_executor/models/telechat_12B.py
@@ -45,13 +45,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
    default_weight_loader, kv_cache_scales_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.sequence import IntermediateTensors, SamplerOutput
+from vllm.sequence import IntermediateTensors
 from vllm.utils import is_hip, print_warning_once

 from .interfaces import SupportsLoRA