Unverified commit 78453792, authored by realliujiaxu and committed by GitHub
Browse files

[Bugfix] add support for 'num_attention_groups' in...


[Bugfix] add support for 'num_attention_groups' in ModelArchConfigConvertorBase for Step3p5 (#39796)
Signed-off-by: realliujiaxu <realliujiaxu@163.com>
parent 4b7ca37b
...@@ -356,6 +356,23 @@ ...@@ -356,6 +356,23 @@
"is_multimodal_model": false, "is_multimodal_model": false,
"dtype": "torch.float32" "dtype": "torch.float32"
}, },
"stepfun-ai/Step-3.5-Flash": {
"architectures": [
"Step3p5ForCausalLM"
],
"model_type": "step3p5",
"text_model_type": "step3p5",
"hidden_size": 4096,
"total_num_hidden_layers": 45,
"total_num_attention_heads": 64,
"head_size": 128,
"vocab_size": 128896,
"total_num_kv_heads": 8,
"num_experts": 288,
"is_deepseek_mla": false,
"is_multimodal_model": false,
"dtype": "torch.bfloat16"
},
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": { "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16": {
"architectures": [ "architectures": [
"NemotronHForCausalLM" "NemotronHForCausalLM"
......
...@@ -16,6 +16,7 @@ BASE_TRUST_REMOTE_CODE_MODELS = { ...@@ -16,6 +16,7 @@ BASE_TRUST_REMOTE_CODE_MODELS = {
"nvidia/Llama-3_3-Nemotron-Super-49B-v1", "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
"nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16", "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
"XiaomiMiMo/MiMo-7B-RL", "XiaomiMiMo/MiMo-7B-RL",
"stepfun-ai/Step-3.5-Flash",
# Excluded: Not available online right now # Excluded: Not available online right now
# "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1", # "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1",
"meituan-longcat/LongCat-Flash-Chat", "meituan-longcat/LongCat-Flash-Chat",
......
...@@ -77,6 +77,8 @@ class ModelArchConfigConvertorBase: ...@@ -77,6 +77,8 @@ class ModelArchConfigConvertorBase:
"num_key_value_heads", "num_key_value_heads",
# For ChatGLM: # For ChatGLM:
"multi_query_group_num", "multi_query_group_num",
# For Step3p5:
"num_attention_groups",
] ]
# For non-grouped-query attention models, the number of KV heads is # For non-grouped-query attention models, the number of KV heads is
# equal to the number of attention heads. # equal to the number of attention heads.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment