Unverified Commit 55dcce91, authored by Lu Fang, committed by GitHub
Browse files

Upstream Llama4 Support to Main (#16113)


Signed-off-by: Aston Zhang <22279212+astonzhang@users.noreply.github.com>
Signed-off-by: Chris Thi <chris.c.thi@gmail.com>
Signed-off-by: drisspg <drisspguessous@gmail.com>
Signed-off-by: Jon Swenson <jmswen@gmail.com>
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
Signed-off-by: Lu Fang <fanglu@meta.com>
Signed-off-by: Xiaodong Wang <xdwang@meta.com>
Signed-off-by: Yang Chen <yangche@fb.com>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Zijing Liu <liuzijing2014@gmail.com>
Signed-off-by: Lu Fang <lufang@fb.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Lucia Fang <fanglu@fb.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 8017c8db
......@@ -389,7 +389,8 @@ steps:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4'
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
......
......@@ -553,6 +553,9 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
if not hasattr(config, "hidden_size"):
# Support for llama4
config = config.text_config
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok
......
......@@ -850,6 +850,13 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `Llama4ForConditionalGeneration`
* Llama-4-17B-Omni-Instruct
* T + I<sup>+</sup>
* `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc.
*
* ✅︎
* ✅︎
- * `LlavaForConditionalGeneration`
* LLaVA-1.5
* T + I<sup>E+</sup>
......
......@@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=5,
max_num_seqs=2,
limit_mm_per_prompt={"audio": audio_count},
)
......
......@@ -582,6 +582,42 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
)
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and chat-formatted prompts for Llama 4.

    Args:
        questions: User questions; each one becomes its own single-turn
            conversation containing one image placeholder followed by the
            question text.
        modality: Input modality. Only ``"image"`` is accepted.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments, the prompts
        rendered by the tokenizer's chat template, and no stop token ids.
    """
    assert modality == "image"

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        # NOTE(review): `args` is not a parameter of this function; it is
        # presumably the module-level parsed CLI namespace -- confirm.
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
        gpu_memory_utilization=0.4,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # One conversation per question: a single user turn with an image
    # placeholder ahead of the question text.
    messages = [[{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }] for question in questions]

    # tokenize=False keeps the rendered prompts as text rather than ids.
    prompts = tokenizer.apply_chat_template(messages,
                                            add_generation_prompt=True,
                                            tokenize=False)

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )
# Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -907,6 +943,7 @@ model_example_map = {
"minicpmv": run_minicpmv,
"mistral3": run_mistral3,
"mllama": run_mllama,
"llama4": run_llama4,
"molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"paligemma": run_paligemma,
......
......@@ -253,6 +253,43 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image Llama 4 request.

    Args:
        question: The text question asked about the images.
        image_urls: URLs of the images to attach; the per-prompt image limit
            is set to the number of URLs given.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the processor's chat template, and the fetched images.
    """
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: one image entry per URL, then the question text.
    content = [{"type": "image", "image": url} for url in image_urls]
    content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": content}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Images are fetched last, after the prompt has been rendered.
    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
......@@ -567,6 +604,7 @@ model_example_map = {
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"llama4": load_llama4,
"mistral3": load_mistral3,
"mllama": load_mllama,
"NVLM_D": load_nvlm_d,
......
......@@ -6,7 +6,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.50.3
transformers >= 4.51.0
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
......
......@@ -30,7 +30,7 @@ mistral_common[opencv] >= 1.5.4 # required for pixtral test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.50.3
transformers==4.51.0
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
# quantization
bitsandbytes>=0.45.3
......
......@@ -645,7 +645,7 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.50.3
transformers==4.51.0
# via
# -r requirements/test.in
# genai-perf
......
......@@ -12,6 +12,7 @@ from vllm.sequence import SampleLogprobs
from ....conftest import HfRunner, VllmRunner
from ....utils import RemoteOpenAIServer
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
......@@ -55,7 +56,10 @@ def server(request, audio_assets):
for key, value in request.param.items()
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
yield remote_server
......@@ -106,6 +110,10 @@ def run_test(
**kwargs,
):
"""Inference result should be the same between hf and vllm."""
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
......@@ -156,6 +164,10 @@ def run_multi_audio_test(
num_logprobs: int,
**kwargs,
):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model,
dtype=dtype,
enforce_eager=True,
......
......@@ -160,17 +160,32 @@ VLM_TEST_SETTINGS = {
),
"aya_vision": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
}),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
max_model_len=8192,
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
),
"aya_vision-multi_image": VLMTestInfo(
models=["CohereForAI/aya-vision-8b"],
test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<image>What is the season?", # noqa: E501
}),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
marks=[large_gpu_mark(min_gb=32)],
),
"blip2": VLMTestInfo(
# TODO: Change back to 2.7b once head_dim = 80 is supported
......@@ -303,6 +318,22 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
img_idx_to_prompt=lambda _: "<|image|>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
distributed_executor_backend="mp",
image_size_factors=[(.25, 0.5, 1.0)],
hf_model_kwargs={"device_map": "auto"},
max_model_len=8192,
max_num_seqs=4,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText,
tensor_parallel_size=8,
vllm_runner_kwargs={"gpu_memory_utilization": 0.8},
marks=multi_gpu_marks(num_gpus=8),
),
"llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
......
......@@ -5,7 +5,9 @@ import re
from typing import Optional
import pytest
from packaging.version import Version
from transformers import AutoTokenizer
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
......@@ -81,6 +83,13 @@ def run_test(
from transformers import AutoImageProcessor # noqa: F401
from transformers import AutoProcessor # noqa: F401
# Once the model repo is updated to 4.49, we should be able to run the
# test in `test_models.py` without the above workaround
if Version(TRANSFORMERS_VERSION) >= Version("4.49"):
pytest.skip(f"`transformers=={TRANSFORMERS_VERSION}` installed, "
"but `transformers<=4.49` is required to run this model. "
"Reason: Cannot run HF implementation")
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
......
......@@ -176,6 +176,8 @@ def test_chat(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
......
......@@ -257,6 +257,7 @@ def _test_processing_correctness_mistral(
"h2oai/h2ovl-mississippi-800m",
"OpenGVLab/InternVL2-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
......
# SPDX-License-Identifier: Apache-2.0
"""Tests for Llama4's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.tokenizer import encode_tokens
from ....conftest import _ImageAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id",
                         ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("disable_mm_preprocessor_cache", [True, False])
@pytest.mark.parametrize("tokenized_prompt", [True, False])
def test_processor_override(
    image_assets: _ImageAssets,
    model_id: str,
    mm_processor_kwargs: dict,
    num_imgs: int,
    disable_mm_preprocessor_cache: bool,
    tokenized_prompt: bool,
):
    """Check the Llama4 multimodal processor output for text or token prompts.

    Verifies placeholder token counts, tile separator counts, placeholder
    offsets, and the patch bookkeeping returned in ``mm_kwargs``.
    """
    ctx = build_model_context(
        model_id,
        mm_processor_kwargs=mm_processor_kwargs,
        limit_mm_per_prompt={"image": num_imgs},
        disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
    )
    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
    info = processor.info
    config = info.get_hf_config()
    tokenizer = info.get_tokenizer()
    hf_processor = info.get_hf_processor()
    vocab = tokenizer.get_vocab()

    prompt = ("<|begin_of_text|><|header_start|>user<|header_end|>" +
              "<|image|>" * num_imgs +
              "<|eot|><|header_start|>assistant<|header_end|>")

    # Cycle through the available assets when more images are requested
    # than assets exist.
    images = [
        image_assets[idx % len(image_assets)].pil_image
        for idx in range(num_imgs)
    ]
    mm_data = {"image": images}

    if tokenized_prompt:
        prompt = encode_tokens(tokenizer, prompt)

    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
    mm_kwargs = processed_inputs["mm_kwargs"]
    token_ids = processed_inputs["prompt_token_ids"]

    # Placeholder replacements: each image contributes exactly one boi, one
    # eoi and one image token.
    assert token_ids.count(config.boi_token_index) == num_imgs
    assert token_ids.count(config.eoi_token_index) == num_imgs
    assert token_ids.count(vocab[hf_processor.image_token]) == num_imgs

    # Separators are only expected for images split into more than one tile.
    expected_x_separators = 0
    expected_y_separators = 0
    for tiles_y, tiles_x in mm_kwargs["aspect_ratios"]:
        if tiles_x * tiles_y <= 1:
            continue
        expected_x_separators += (tiles_x - 1) * tiles_y
        expected_y_separators += tiles_y

    assert (token_ids.count(vocab[hf_processor.tile_token]) ==
            expected_x_separators)
    assert (token_ids.count(vocab[hf_processor.tile_global_token]) ==
            expected_y_separators)

    # Image token offsets: every placeholder starts at a boi token.
    img_locs = processed_inputs["mm_placeholders"].get("image", [])
    assert len(img_locs) == num_imgs
    reported_offsets = [loc["offset"] for loc in img_locs]
    boi_positions = [
        pos for pos, tok in enumerate(token_ids)
        if tok == config.boi_token_index
    ]
    assert reported_offsets == boi_positions

    # Patch sizes and masks.
    assert token_ids.count(config.image_token_index) == sum(
        mask.sum() for mask in mm_kwargs["embed_is_patch"])

    patch_token_id = vocab[hf_processor.img_patch_token]
    num_patches = token_ids.count(patch_token_id)
    mm_counts = {"image": num_imgs}
    max_tokens_per_image = info.get_mm_max_tokens_per_item(
        32768, mm_counts)["image"]
    assert num_patches / num_imgs <= max_tokens_per_image

    num_patches_per_chunk = info.get_patch_per_chunk(config.vision_config)
    total_chunks = mm_kwargs["patches_per_image"].sum()
    assert (token_ids.count(config.image_token_index) ==
            total_chunks * num_patches_per_chunk)
    assert mm_kwargs["pixel_values"].shape[0] == total_chunks

    # Each mask must be as long as the tokenized per-image prompt expansion.
    for embed_is_patch, aspect_ratio in zip(mm_kwargs["embed_is_patch"],
                                            mm_kwargs["aspect_ratios"]):
        expanded_text = hf_processor._prompt_split_image(
            aspect_ratio, num_patches_per_chunk)
        expanded_ids = tokenizer.encode(expanded_text,
                                        add_special_tokens=False)
        assert embed_is_patch.shape[0] == len(expanded_ids)
......@@ -287,12 +287,16 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible."), # noqa: E501
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
min_transformers_version="4.51"),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
"mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501
......
......@@ -7,6 +7,8 @@ from transformers import PretrainedConfig
from vllm import LLM
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.utils import GiB_bytes
from vllm.v1.core.kv_cache_utils import get_kv_cache_config
from vllm.v1.engine.core import EngineCore as V1EngineCore
from .registry import HF_EXAMPLE_MODELS
......@@ -42,14 +44,21 @@ def test_can_initialize(model_arch):
self.cache_config.num_gpu_blocks = 0
self.cache_config.num_cpu_blocks = 0
def _initalize_kv_caches_v1(self, vllm_config):
# gpu_blocks (> 0), cpu_blocks
return 1, 0
def _initialize_kv_caches_v1(self, vllm_config):
kv_cache_specs = self.model_executor.get_kv_cache_specs()
scheduler_kv_cache_config = get_kv_cache_config(
vllm_config,
kv_cache_specs[0],
20 * GiB_bytes,
)
# gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
return 1, 0, scheduler_kv_cache_config
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
_initialize_kv_caches_v0),
patch.object(V1EngineCore, "_initialize_kv_caches",
_initalize_kv_caches_v1)):
_initialize_kv_caches_v1)):
LLM(
model_info.default,
tokenizer=model_info.tokenizer,
......
......@@ -358,6 +358,8 @@ class ModelConfig:
self.hf_config = hf_config
self.hf_text_config = get_hf_text_config(self.hf_config)
self.attention_chunk_size = getattr(self.hf_text_config,
"attention_chunk_size", None)
self.encoder_config = self._get_encoder_config()
self.hf_image_processor_config = get_hf_image_processor_config(
self.model, hf_token=hf_token, revision=revision)
......
......@@ -500,7 +500,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
"internvl_chat", "skywork_chat", "NVLM_D",
"h2ovl_chat", "idefics3"):
return "<image>"
if model_type == "mllama":
if model_type in ("mllama", "llama4"):
return "<|image|>"
if model_type in ("qwen2_vl", "qwen2_5_vl"):
return "<|vision_start|><|image_pad|><|vision_end|>"
......
{
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 1
},
"2": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 16,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 1
},
"8": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"16": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 1
},
"24": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 1
},
"48": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 1
},
"64": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"96": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"128": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"256": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 256,
"GROUP_SIZE_M": 8,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"1024": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"1536": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 8,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"3072": {
"BLOCK_SIZE_M": 128,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"waves_per_eu": 0,
"matrix_instr_nonkdim": 16,
"kpack": 2
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment