{
  "DeepSeek-Coder-V2-Instruct": {
    "hidden_size": 5120,
    "intermediate_size": 12288,
    "max_position_embeddings": 163840,
    "model_type": "deepseek_v2",
    "num_attention_heads": 128,
    "num_hidden_layers": 60,
    "num_key_value_heads": 128,
    "vocab_size": 102400
  },
  "DeepSeek-R1": {
    "hidden_size": 7168,
    "intermediate_size": 18432,
    "max_position_embeddings": 163840,
    "model_type": "deepseek_v3",
    "num_attention_heads": 128,
    "num_hidden_layers": 61,
    "num_key_value_heads": 128,
    "vocab_size": 129280
  },
  "DeepSeek-V2-Lite-Chat": {
    "hidden_size": 2048,
    "intermediate_size": 10944,
    "max_position_embeddings": 163840,
    "model_type": "deepseek_v2",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "num_key_value_heads": 16,
    "vocab_size": 102400
  },
  "DeepSeek-V3": {
    "hidden_size": 7168,
    "intermediate_size": 18432,
    "max_position_embeddings": 163840,
    "model_type": "deepseek_v3",
    "num_attention_heads": 128,
    "num_hidden_layers": 61,
    "num_key_value_heads": 128,
    "vocab_size": 129280
  },
  "DeepSeek-V3-bf16": {
    "hidden_size": 7168,
    "intermediate_size": 18432,
    "max_position_embeddings": 163840,
    "model_type": "deepseek_v3",
    "num_attention_heads": 128,
    "num_hidden_layers": 61,
    "num_key_value_heads": 128,
    "vocab_size": 129280
  },
  "LLaMA-2-7B-32K": {
    "hidden_size": 4096,
    "intermediate_size": 11008,
    "max_position_embeddings": 32768,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 32,
    "vocab_size": 32000
  },
  "Moonlight-16B-A3B-Instruct": {
    "hidden_size": 2048,
    "intermediate_size": 11264,
    "max_position_embeddings": 8192,
    "model_type": "deepseek_v3",
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "num_key_value_heads": 16,
    "vocab_size": 163840
  },
  "Qwen2.5-32B-Instruct": {
    "hidden_size": 5120,
    "intermediate_size": 27648,
    "max_position_embeddings": 32768,
    "model_type": "qwen2",
    "num_attention_heads": 40,
    "num_hidden_layers": 64,
    "num_key_value_heads": 8,
    "vocab_size": 152064
  },
  "Qwen2.5-32B-Instruct-GPTQ-Int4": {
    "hidden_size": 5120,
    "intermediate_size": 27648,
    "max_position_embeddings": 32768,
    "model_type": "qwen2",
    "num_attention_heads": 40,
    "num_hidden_layers": 64,
    "num_key_value_heads": 8,
    "vocab_size": 152064
  },
  "Qwen2.5-7B-Instruct": {
    "hidden_size": 3584,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "model_type": "qwen2",
    "num_attention_heads": 28,
    "num_hidden_layers": 28,
    "num_key_value_heads": 4,
    "vocab_size": 152064
  },
  "Qwen2.5-7B-Instruct-GPTQ-Int4": {
    "hidden_size": 3584,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "model_type": "qwen2",
    "num_attention_heads": 28,
    "num_hidden_layers": 28,
    "num_key_value_heads": 4,
    "vocab_size": 152064
  },
  "qwen2-72b-instruct": {
    "hidden_size": 8192,
    "intermediate_size": 29568,
    "max_position_embeddings": 32768,
    "model_type": "qwen2",
    "num_attention_heads": 64,
    "num_hidden_layers": 80,
    "num_key_value_heads": 8,
    "vocab_size": 152064
  }
}