Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
......@@ -99,7 +99,7 @@ def test_register_quantization_config():
@pytest.mark.parametrize(argnames="model",
argvalues=[
"meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct",
])
def test_custom_quant(vllm_runner, model):
"""Test infer with the custom quantization method."""
......
......@@ -12,7 +12,7 @@ from ..utils import models_path_prefix
# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")]
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2"), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -7,7 +7,7 @@ import os
from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -11,7 +11,7 @@ from vllm import SamplingParams
from ..conftest import VllmRunner
from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -77,7 +77,7 @@ class TestOneTokenBadWord:
class TestTwoTokenBadWord:
# Another model (with a different tokenizer behaviour)
MODEL = os.path.join(models_path_prefix, "openai-community/gpt2")
MODEL = os.path.join(models_path_prefix, "distilbert/distilgpt2")
PROMPT = "How old are you? I am 10"
TARGET_TOKEN1 = "years"
......
......@@ -6,7 +6,7 @@ import os
from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -10,7 +10,7 @@ from ...utils import models_path_prefix
@pytest.mark.parametrize("common_llm_kwargs", [{
"model": os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
"model": os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
}])
......@@ -29,8 +29,8 @@ from ...utils import models_path_prefix
},
{
# Speculative max model len > target max model len should raise.
# https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
"speculative_max_model_len": 4096 + 1,
# https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18
"speculative_max_model_len": 131072 + 1,
},
])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
......
......@@ -308,6 +308,150 @@ def test_eagle_disable_queue(vllm_runner, common_llm_kwargs,
batch_size, output_len, seed)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": "float16",
# Main model
"model_name": "meta-llama/Llama-2-7b-chat-hf",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-llama2-chat-7B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize("seed", [1])
def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
output_len: int, seed: int):
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": "float16",
# Main model
"model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize("seed", [1])
def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
output_len: int, seed: int):
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": "float16",
# Main model
"model_name": "Qwen/Qwen2-7B-Instruct",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "yuhuili/EAGLE-Qwen2-7B-Instruct",
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize("seed", [1])
def test_qwen2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs, batch_size: int,
output_len: int, seed: int):
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
temperature=0.0)
if __name__ == "__main__":
import pytest
pytest.main([__file__])
# SPDX-License-Identifier: Apache-2.0
"""This docstring details important information on the testing methodology.
Most of the tests rely on "greedy equality", where we expect the output of
speculative decoding on a sequence to exactly match the output of normal non-
speculative decoding.
Since speculative decoding with rejection sampling guarantees that the output
distribution matches the target model's output distribution (up to hardware
numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
equality.
However, we still need to verify below scenario could be passed:
* Batch size 1 greedy equality
* Batch size >1 greedy equality
* Test greedy equality under preemption
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, mtp would not break the
correctess for the target model outputs.
"""
import pytest
from .conftest import run_equality_correctness_test
# main model
MAIN_MODEL = "luccafong/deepseek_mtp_main_random"
# max. number of speculative tokens: this corresponds to
# num_nextn_predict_layers in the config.json of the speculator model.
MAX_SPEC_TOKENS = 1
# precision
PRECISION = "bfloat16"
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
# GPU memory utilization
"gpu_memory_utilization": 0.85
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
])
@pytest.mark.parametrize("output_len", [
128,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int,
seed: int):
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, output_len, seed)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
# GPU memory utilization
"gpu_memory_utilization": 0.85
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": False,
},
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"disable_logprobs_during_spec_decoding": True,
},
])
@pytest.mark.parametrize("output_len", [
128,
])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("seed", [1])
@pytest.mark.parametrize("logprobs", [1, 6])
def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size: int, output_len: int, seed: int,
logprobs: int):
run_equality_correctness_test(vllm_runner,
common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size,
output_len,
seed,
logprobs=logprobs,
prompt_logprobs=logprobs,
disable_logprobs=test_llm_kwargs[
'disable_logprobs_during_spec_decoding'])
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"enforce_eager": False,
# Print spec metrics.
"disable_log_stats": False,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
"gpu_memory_utilization": 0.85
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
])
@pytest.mark.parametrize("output_len", [
128,
])
@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs,
test_llm_kwargs,
batch_size: int,
output_len: int, seed: int):
"""Verify greedy equality with cuda graph enabled and different
batch sizes."""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, output_len, seed)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"block_size": 8,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override": 2 + 256 // 8,
"max_model_len": (2 + 256 // 8) * 8,
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
# GPU memory utilization
"gpu_memory_utilization": 0.9
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"num_speculative_tokens": MAX_SPEC_TOKENS,
},
])
@pytest.mark.parametrize(
"output_len",
[
# Use small output len for fast test.
128,
])
@pytest.mark.parametrize("batch_size", [4])
@pytest.mark.parametrize("seed", [1])
def test_mtp_e2e_greedy_correctness_with_preemption(
vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify greedy equality, even when some sequences are preempted mid-
generation.
"""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, output_len, seed)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
# GPU memory utilization
"gpu_memory_utilization": 0.9
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize(
"test_llm_kwargs",
[
{
"num_speculative_tokens": k,
}
# Try a range of num. speculative tokens
for k in range(1, 1 + MAX_SPEC_TOKENS)
])
@pytest.mark.parametrize("batch_size", [2])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("seed", [1])
def test_mtp_different_k(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify that mtp speculative decoding produces exact equality
to without spec decode with different values of num_speculative_tokens.
"""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, output_len, seed)
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,
# Precision
"dtype": PRECISION,
# Main model
"model_name": MAIN_MODEL,
# GPU memory utilization
"gpu_memory_utilization": 0.9
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs",
[{
"num_speculative_tokens": MAX_SPEC_TOKENS,
"speculative_disable_by_batch_size": 4
}])
@pytest.mark.parametrize("batch_size", [1, 5])
@pytest.mark.parametrize(
"output_len",
[
# Use smaller output len for fast test.
32,
])
@pytest.mark.parametrize("seed", [1])
def test_mtp_disable_queue(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs, baseline_llm_kwargs,
test_llm_kwargs, batch_size: int, output_len: int,
seed: int):
"""Verify that mtp speculative decoding produces exact equality
to without spec decode when speculation is disabled for large
batch sizes.
"""
run_equality_correctness_test(vllm_runner, common_llm_kwargs,
per_test_common_llm_kwargs,
baseline_llm_kwargs, test_llm_kwargs,
batch_size, output_len, seed)
if __name__ == "__main__":
import pytest
pytest.main([__file__])
......@@ -13,15 +13,18 @@ from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import ExecuteModelRequest, SequenceOutput
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
SpecDecodeWorkerMetrics)
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker,
split_num_cache_blocks_evenly)
from vllm.worker.worker import Worker
from .test_utils import mock_spec_decode_sampler
from .utils import create_batch, create_sampler_output_list, mock_worker
from .utils import (create_batch, create_sampler_output_list, create_worker,
mock_worker)
@pytest.mark.parametrize('k', [1, 2, 6])
......@@ -905,3 +908,38 @@ def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str):
worker.execute_model(execute_model_req=execute_model_req)
# but first draft still counted
assert draft_worker.get_spec_proposals.call_count == 1
def test_correctly_load_weight_for_eagle():
"""
Verify SpecDecodeWorker loads lm_head weight for eagle correctly.
"""
seed = 100
block_size = 32
num_gpu_blocks = 8096 // block_size
target_worker = create_worker(
Worker,
"JackFram/llama-68m",
block_size,
num_gpu_blocks,
seed,
)
draft_worker = create_worker(
MultiStepWorker,
"abhigoyal/vllm-eagle-llama-68m-random",
block_size,
num_gpu_blocks,
seed,
model_runner_cls=TP1DraftModelRunner,
)
spec_decode_sampler = mock_spec_decode_sampler("rejection_sampler")
worker = SpecDecodeWorker(draft_worker,
target_worker,
spec_decode_sampler,
disable_logprobs=False)
worker.proposer_worker.maybe_load_lm_head_weight(
target_worker.model_runner.model.lm_head.weight.data)
assert torch.allclose(
worker.proposer_worker.worker.model_runner.model.lm_head.weight.data,
worker.scorer_worker.model_runner.model.lm_head.weight.data)
......@@ -10,15 +10,22 @@ from vllm.model_executor.layers.pooler import PoolingType
from vllm.platforms import current_platform
from utils import models_path_prefix
from .conftest import MODEL_WEIGHTS_S3_BUCKET
@pytest.mark.parametrize(
("model_id", "expected_runner_type", "expected_task"),
[
(os.path.join(models_path_prefix, "facebook/opt-125m"), "generate", "generate"),
(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"), "pooling", "embed"),
(os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach"), "pooling", "classify"),
(os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), "pooling", "score"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2", "generate",
"generate"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/intfloat/e5-mistral-7b-instruct",
"pooling", "embed"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/jason9693/Qwen2.5-1.5B-apeach", "pooling",
"classify"),
(f"{MODEL_WEIGHTS_S3_BUCKET}/cross-encoder/ms-marco-MiniLM-L-6-v2",
"pooling", "score"),
(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"), "pooling", "reward"),
(os.path.join(models_path_prefix, "openai/whisper-small"), "transcription", "transcription"),
],
)
def test_auto_task(model_id, expected_runner_type, expected_task):
......@@ -252,7 +259,7 @@ def test_rope_customization():
@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [
("facebook/opt-125m", False),
("facebook/bart-base", True),
("meta-llama/Llama-3.2-1B", False),
("meta-llama/Llama-3.2-1B-Instruct", False),
("meta-llama/Llama-3.2-11B-Vision", True),
])
def test_is_encoder_decoder(model_id, is_encoder_decoder):
......
......@@ -12,6 +12,9 @@ import torch
from vllm import LLM, SamplingParams
from utils import models_path_prefix
import os
from vllm.config import LoadFormat
from .conftest import MODEL_WEIGHTS_S3_BUCKET
def test_duplicated_ignored_sequence_group():
......@@ -20,7 +23,8 @@ def test_duplicated_ignored_sequence_group():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=256)
llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["This is a short prompt", "This is a very long prompt " * 1000]
......@@ -33,7 +37,8 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=None)
llm = LLM(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
max_num_batched_tokens=4096,
tensor_parallel_size=1)
prompts = ["Just say hello!"]
......@@ -43,7 +48,9 @@ def test_max_tokens_none():
def test_gc():
llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"), enforce_eager=True)
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/distilbert/distilgpt2",
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
del llm
gc.collect()
......
......@@ -11,6 +11,7 @@ from vllm.scalar_type import scalar_types
(0, 15, scalar_types.uint4),
(-8, 7, scalar_types.uint4b8),
(-128, 127, scalar_types.uint8b128),
(-6., 6., scalar_types.float4_e2m1fn),
(-28., 28., scalar_types.float6_e3m2f),
(torch.int8, scalar_types.int8),
(torch.uint8, scalar_types.uint8),
......
# SPDX-License-Identifier: Apache-2.0
import random
import numpy as np
import torch
from vllm.platforms.interface import Platform
def test_seed_behavior():
# Test with a specific seed
Platform.seed_everything(42)
random_value_1 = random.randint(0, 100)
np_random_value_1 = np.random.randint(0, 100)
torch_random_value_1 = torch.randint(0, 100, (1, )).item()
Platform.seed_everything(42)
random_value_2 = random.randint(0, 100)
np_random_value_2 = np.random.randint(0, 100)
torch_random_value_2 = torch.randint(0, 100, (1, )).item()
assert random_value_1 == random_value_2
assert np_random_value_1 == np_random_value_2
assert torch_random_value_1 == torch_random_value_2
......@@ -47,12 +47,12 @@ def test_filter_subtensors():
@pytest.fixture(scope="module")
def llama_2_7b_files():
def llama_3p2_1b_files():
with TemporaryDirectory() as cache_dir:
# input_dir = snapshot_download("meta-llama/Llama-3.2-1B",
# input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
# cache_dir=cache_dir,
# ignore_patterns=["*.bin*", "original/*"])
input_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
input_dir = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
yield input_dir
......@@ -82,13 +82,13 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
@pytest.mark.parametrize("enable_lora", [False, True])
@pytest.mark.parametrize("tp_size", [1, 2])
def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
llama_2_7b_files):
llama_3p2_1b_files):
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
weights_patterns = ("*.safetensors", )
gpu_memory_utilization = 0.8
input_dir = llama_2_7b_files
input_dir = llama_3p2_1b_files
ctx = mp.get_context("spawn")
# Run in separate processes for memory & CUDA isolation
......
......@@ -34,7 +34,7 @@ TOKENIZERS = [
os.path.join(models_path_prefix, "bigscience/bloom-560m"),
os.path.join(models_path_prefix, "mosaicml/mpt-7b"),
os.path.join(models_path_prefix, "tiiuae/falcon-7b"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "codellama/CodeLlama-7b-hf"),
os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"),
]
......
......@@ -23,7 +23,7 @@ def test_get_llama3_eos_token():
generation_config = try_get_generation_config(model_name,
trust_remote_code=False)
assert generation_config is not None
assert generation_config.eos_token_id == [128001, 128009]
assert generation_config.eos_token_id == [128001, 128008, 128009]
def test_get_blip2_eos_token():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment