norm / vllm · Commit 898285c9 (unverified)

fix: CUDA error when inferencing with Falcon-40B base model (#992)

Authored Sep 10, 2023 by Kyujin Cho; committed by GitHub, Sep 10, 2023.
Parent: a62de9ec
Showing 1 changed file with 2 additions and 1 deletion.

vllm/config.py (+2 / -1)
@@ -114,8 +114,9 @@ class ModelConfig:
         # Note: for falcon, when new_decoder_architecture is True, the
         # multi_query flag is ignored and we use n_head_kv for the number of
         # KV heads.
+        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
         new_decoder_arch_falcon = (
-            self.hf_config.model_type == "falcon"
+            self.hf_config.model_type in falcon_model_types
             and getattr(self.hf_config, "new_decoder_architecture", False))
         if not new_decoder_arch_falcon and getattr(self.hf_config,
                                                    "multi_query", False):
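The substance of the fix: older Falcon checkpoints ship Hugging Face configs whose model_type is the legacy "RefinedWeb" (Falcon-40B) or "RefinedWebModel" (Falcon-7B) rather than "falcon", so the old equality check never recognized them as new-decoder-architecture models and fell through to the multi-query path (a single KV head), which mismatches the checkpoint's actual n_head_kv and triggers the CUDA error at inference time. The following is a minimal sketch of the selection logic after the fix, not vLLM's actual API: num_kv_heads and the SimpleNamespace config are illustrative stand-ins, and the multi_query=True value is an assumption for demonstration (model_type "RefinedWeb" and n_head_kv=8 mirror the legacy Falcon-40B config).

    # Minimal sketch of the fixed KV-head selection logic.
    # num_kv_heads and FakeHFConfig-style SimpleNamespace are hypothetical
    # stand-ins, not vLLM's real interface.
    from types import SimpleNamespace

    def num_kv_heads(hf_config) -> int:
        # For Falcon, when new_decoder_architecture is True, the multi_query
        # flag is ignored and n_head_kv gives the number of KV heads.
        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
        new_decoder_arch_falcon = (
            hf_config.model_type in falcon_model_types
            and getattr(hf_config, "new_decoder_architecture", False))
        if not new_decoder_arch_falcon and getattr(hf_config, "multi_query",
                                                   False):
            # Multi-query attention: one shared KV head.
            return 1
        # Otherwise use the explicit KV-head count from the config.
        return getattr(hf_config, "n_head_kv", 1)

    # Legacy Falcon-40B config reports model_type "RefinedWeb", not "falcon".
    # multi_query=True is assumed here to show the failure mode of the old check.
    falcon_40b = SimpleNamespace(model_type="RefinedWeb",
                                 new_decoder_architecture=True,
                                 multi_query=True,
                                 n_head_kv=8)
    assert num_kv_heads(falcon_40b) == 8

Under the old check (model_type == "falcon"), this config would have taken the multi_query branch and returned 1 KV head; allocating the KV cache with that shape against weights expecting 8 KV heads is what surfaced as the CUDA error the commit title describes.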