Commit ad58e9b3 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.1.post2' into v0.6.1.post2-dev

parents 408f663a 9ba0817f
...@@ -6,11 +6,13 @@ prefill requests are chunked. ...@@ -6,11 +6,13 @@ prefill requests are chunked.
Run `pytest tests/models/test_chunked_prefill.py`. Run `pytest tests/models/test_chunked_prefill.py`.
""" """
import os
from contextlib import nullcontext from contextlib import nullcontext
import pytest import pytest
from ..models.utils import check_logprobs_close, check_outputs_equal from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
...@@ -66,6 +68,59 @@ def test_models( ...@@ -66,6 +68,59 @@ def test_models(
) )
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
) -> None:
    """Compare greedy HF output with 2-way tensor-parallel vLLM output.

    vLLM is started with chunked prefill enabled and a batched-token
    budget of 16, then its greedy generations for ``example_prompts`` are
    checked for equality against the HF runner's generations.

    Arguments:

    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * example_prompts: prompts fed to both runners
    * model: model identifier passed to both runners
    * distributed_executor_backend: forwarded to the vLLM runner
    """
    exercise_ray_adag = (distributed_executor_backend == "ray"
                         and model == "meta-llama/Llama-2-7b-hf")
    if exercise_ray_adag:
        # test ray adag
        os.environ.update({
            'VLLM_USE_RAY_SPMD_WORKER': "1",
            'VLLM_USE_RAY_COMPILED_DAG': "1",
        })

    half_precision = "half"
    new_token_budget = 5

    # Chunked prefill config: the chunk size doubles as the per-step
    # batched-token budget and (capped at 256) as the sequence limit.
    chunk_size = 16
    assert chunk_size != -1
    engine_options = dict(
        dtype=half_precision,
        tensor_parallel_size=2,
        max_num_seqs=min(chunk_size, 256),
        enable_chunked_prefill=True,
        max_num_batched_tokens=chunk_size,
        distributed_executor_backend=distributed_executor_backend,
    )

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model, **engine_options) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                  new_token_budget)

    with hf_runner(model, dtype=half_precision) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts,
                                              new_token_budget)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"kv_cache_dtype,model", "kv_cache_dtype,model",
[("fp8_e4m3", [("fp8_e4m3",
......
...@@ -19,7 +19,10 @@ MODELS = [ ...@@ -19,7 +19,10 @@ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
] ]
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. " "Use an env var VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest " "`VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`") "tests/basic_correctness/test_preemption.py`")
...@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute( ...@@ -64,6 +67,7 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray, worker_use_ray=worker_use_ray,
disable_log_stats=False,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
......
...@@ -16,5 +16,7 @@ def test_full_graph(model): ...@@ -16,5 +16,7 @@ def test_full_graph(model):
"The future of AI is", "The future of AI is",
] ]
sampling_params = SamplingParams(temperature=0) sampling_params = SamplingParams(temperature=0)
llm = LLM(model="meta-llama/Meta-Llama-3-8B") llm = LLM(model="meta-llama/Meta-Llama-3-8B",
enforce_eager=True,
load_format="dummy")
llm.generate(prompts, sampling_params) llm.generate(prompts, sampling_params)
...@@ -6,8 +6,8 @@ import sys ...@@ -6,8 +6,8 @@ import sys
import tempfile import tempfile
from collections import UserList from collections import UserList
from enum import Enum from enum import Enum
from typing import (Any, Callable, Dict, List, Optional, Tuple, TypedDict, from typing import (Any, Callable, Dict, List, Optional, Tuple, Type,
TypeVar, Union) TypedDict, TypeVar, Union)
import numpy as np import numpy as np
import pytest import pytest
...@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download ...@@ -18,6 +18,7 @@ from huggingface_hub import snapshot_download
from PIL import Image from PIL import Image
from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding, from transformers import (AutoModelForCausalLM, AutoTokenizer, BatchEncoding,
BatchFeature) BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
...@@ -260,7 +261,7 @@ class HfRunner: ...@@ -260,7 +261,7 @@ class HfRunner:
*, *,
model_kwargs: Optional[Dict[str, Any]] = None, model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False, is_embedding_model: bool = False,
auto_cls=AutoModelForCausalLM, auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
postprocess_inputs: Callable[[BatchEncoding], postprocess_inputs: Callable[[BatchEncoding],
BatchEncoding] = identity, BatchEncoding] = identity,
) -> None: ) -> None:
...@@ -292,7 +293,6 @@ class HfRunner: ...@@ -292,7 +293,6 @@ class HfRunner:
trust_remote_code=True, trust_remote_code=True,
) )
try:
# don't put this import at the top level # don't put this import at the top level
# it will call torch.cuda.device_count() # it will call torch.cuda.device_count()
from transformers import AutoProcessor # noqa: F401 from transformers import AutoProcessor # noqa: F401
...@@ -301,11 +301,6 @@ class HfRunner: ...@@ -301,11 +301,6 @@ class HfRunner:
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
trust_remote_code=True, trust_remote_code=True,
) )
except Exception as exc:
logger.warning(
"Unable to auto-load HuggingFace processor for model (%s). "
"Using tokenizer instead. Reason: %s", model_name, exc)
self.processor = self.tokenizer
self.postprocess_inputs = postprocess_inputs self.postprocess_inputs = postprocess_inputs
...@@ -658,8 +653,8 @@ class VllmRunner: ...@@ -658,8 +653,8 @@ class VllmRunner:
outputs.append((req_sample_output_ids, req_sample_output_strs)) outputs.append((req_sample_output_ids, req_sample_output_strs))
return outputs return outputs
@staticmethod
def _final_steps_generate_w_logprobs( def _final_steps_generate_w_logprobs(
self,
req_outputs: List[RequestOutput], req_outputs: List[RequestOutput],
) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]: ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = [] outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
......
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness.py
```
"""
import os
import pytest
from vllm.utils import cuda_device_count_stateless
from ..models.utils import check_outputs_equal
from ..utils import fork_new_process_for_each_test
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite",
    [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ],
)
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:
    """Compare greedy HF output with 2-way tensor-parallel vLLM output.

    Arguments:

    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * example_prompts: prompts fed to both runners
    * model: model identifier passed to both runners
    * distributed_executor_backend: forwarded to the vLLM runner
    * attention_backend: when non-empty, exported as
      VLLM_ATTENTION_BACKEND before vLLM starts
    * test_suite: the case is skipped unless this equals the module-level
      TARGET_TEST_SUITE
    """
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    this_case = (model, distributed_executor_backend, attention_backend,
                 test_suite)
    if this_case == ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"):
        # test ray adag
        os.environ.update({
            'VLLM_USE_RAY_SPMD_WORKER': "1",
            'VLLM_USE_RAY_COMPILED_DAG': "1",
        })

    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    half_precision = "half"
    new_token_budget = 5

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(
            model,
            dtype=half_precision,
            tensor_parallel_size=2,
            distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                  new_token_budget)

    with hf_runner(model, dtype=half_precision) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts,
                                              new_token_budget)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
"""For encoder/decoder models only:
Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
cd $VLLM_PATH/tests
pytest distributed/test_basic_distributed_correctness_enc_dec.py
```
"""
import pytest
from transformers import AutoModelForSeq2SeqLM
from vllm.utils import cuda_device_count_stateless
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
from ..utils import fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("facebook/bart-large-cnn", "ray"),
    ("facebook/bart-large-cnn", "mp"),
])
@fork_new_process_for_each_test
def test_models(
    model: str,
    distributed_executor_backend: str,
    hf_runner,
    vllm_runner,
    example_encoder_decoder_prompts,
) -> None:
    '''
    Run BART through vLLM with tensor_parallel_size=2 and check that its
    greedy logprobs are close to the HF baseline.

    Each case is forked into its own process so that successive tests do
    not re-initialize CUDA within the same process.

    Arguments:

    * model: the HF ID of the specific BART variant under test
    * distributed_executor_backend: forwarded to the vLLM runner
    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * example_encoder_decoder_prompts: test fixture which provides a
                                       dictionary of dummy prompts
    '''
    full_precision = "float"
    new_token_budget = 64
    logprob_count = 5

    # Use the prompt set whose encoder and decoder prompts are both
    # non-trivial (i.e. not None/empty).
    prompts = example_encoder_decoder_prompts[DecoderPromptType.CUSTOM]

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(
            model,
            dtype=full_precision,
            tensor_parallel_size=2,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
            prompts, new_token_budget, logprob_count)

    # Configuration settings for HF baseline
    hf_generation_overrides = dict(
        top_k=None,
        num_beams=1,
        repetition_penalty=1.0,
        top_p=1.0,
        length_penalty=1.0,
        early_stopping=False,
        no_repeat_ngram_size=None,
        min_length=0,
    )

    with hf_runner(model,
                   dtype=full_precision,
                   auto_cls=AutoModelForSeq2SeqLM) as hf_model:
        hf_outputs = hf_model.generate_encoder_decoder_greedy_logprobs_limit(
            prompts,
            new_token_budget,
            logprob_count,
            **hf_generation_overrides,
        )

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
Run:
```sh
pytest test_chunked_prefill_distributed.py
```
"""
import os
import pytest
from vllm.utils import cuda_device_count_stateless
from ..models.utils import check_outputs_equal
from ..utils import fork_new_process_for_each_test
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("facebook/opt-125m", "ray"),
    ("meta-llama/Llama-2-7b-hf", "ray"),
    ("facebook/opt-125m", "mp"),
    ("meta-llama/Llama-2-7b-hf", "mp"),
])
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
) -> None:
    """Compare greedy HF output with chunked-prefill, 2-way TP vLLM output.

    Arguments:

    * hf_runner: HuggingFace (HF) test model runner
    * vllm_runner: vLLM test model runner
    * example_prompts: prompts fed to both runners
    * model: model identifier passed to both runners
    * distributed_executor_backend: forwarded to the vLLM runner
    """
    exercise_ray_adag = (distributed_executor_backend == "ray"
                         and model == "meta-llama/Llama-2-7b-hf")
    if exercise_ray_adag:
        assert distributed_executor_backend == "ray"
        # test ray adag
        os.environ.update({
            'VLLM_USE_RAY_SPMD_WORKER': "1",
            'VLLM_USE_RAY_COMPILED_DAG': "1",
        })

    half_precision = "half"
    new_token_budget = 5

    # Chunked prefill config: the chunk size doubles as the per-step
    # batched-token budget and (capped at 256) as the sequence limit.
    chunk_size = 16
    assert chunk_size != -1
    engine_options = dict(
        dtype=half_precision,
        tensor_parallel_size=2,
        max_num_seqs=min(chunk_size, 256),
        enable_chunked_prefill=True,
        max_num_batched_tokens=chunk_size,
        distributed_executor_backend=distributed_executor_backend,
    )

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
    # if we run HF first, the cuda initialization will be done and it
    # will hurt multiprocessing backend with fork method (the default method).
    with vllm_runner(model, **engine_options) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                  new_token_budget)

    with hf_runner(model, dtype=half_precision) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts,
                                              new_token_budget)

    check_outputs_equal(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
...@@ -32,9 +32,11 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" ...@@ -32,9 +32,11 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "ray"), # NOTE: InternVL2 multi-node tests are flaky,
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "ray"), # use mp backend to skip the multi-node tests
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "ray"), (1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
......
import os import os
import torch import torch.distributed as dist
from vllm.distributed.parallel_state import in_the_same_node_as from vllm.distributed.parallel_state import in_the_same_node_as
torch.distributed.init_process_group(backend="gloo") if __name__ == "__main__":
test_result = all( dist.init_process_group(backend="gloo")
in_the_same_node_as(torch.distributed.group.WORLD, source_rank=0)) test_result = all(in_the_same_node_as(dist.group.WORLD, source_rank=0))
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1" expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}" assert test_result == expected, f"Expected {expected}, got {test_result}"
print("Same node test passed!") print("Same node test passed!")
...@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str): ...@@ -11,9 +11,10 @@ def test_skip_tokenizer_initialization(model: str):
# token ids. # token ids.
llm = LLM(model=model, skip_tokenizer_init=True) llm = LLM(model=model, skip_tokenizer_init=True)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
with pytest.raises(ValueError) as err:
with pytest.raises(ValueError, match="cannot pass text prompts when"):
llm.generate("abc", sampling_params) llm.generate("abc", sampling_params)
assert "prompts must be None if" in str(err.value)
outputs = llm.generate({"prompt_token_ids": [1, 2, 3]}, outputs = llm.generate({"prompt_token_ids": [1, 2, 3]},
sampling_params=sampling_params) sampling_params=sampling_params)
assert len(outputs) > 0 assert len(outputs) > 0
......
"""Tests for HF_HUB_OFFLINE mode"""
import importlib
import sys
import weakref
import pytest
from vllm import LLM
from ...conftest import cleanup
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
def llm():
    """Module-scoped fixture yielding a weak proxy to an LLM for MODEL_NAME.

    After the dependent tests finish, the local reference is dropped and
    ``cleanup()`` is called.
    """
    # NOTE(review): block nesting below was reconstructed from a listing
    # whose indentation had been stripped -- confirm that ``del`` belongs
    # inside the ``with`` and ``cleanup()`` outside it.
    engine_options = dict(
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )
    engine = LLM(model=MODEL_NAME, **engine_options)

    with engine.deprecate_legacy_api():
        # pytest caches the fixture value, so hand out a weakref.proxy;
        # that way the cached value does not block garbage collection.
        yield weakref.proxy(engine)

        del engine

    cleanup()
@pytest.mark.skip_global_cleanup
def test_offline_mode(llm: LLM, monkeypatch):
    """Construct an LLM for MODEL_NAME while HF_HUB_OFFLINE=1 is set.

    The test passes when construction does not raise. The environment
    variable is removed and the HF modules are reloaded again afterwards,
    whether or not construction succeeded.
    """
    # The fixture is requested only so the model files are already cached;
    # the object itself is not needed.
    del llm

    offline_engine_options = dict(
        max_num_batched_tokens=4096,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.10,
        enforce_eager=True,
    )

    try:
        # Switch HF to offline mode; huggingface_hub and friends must be
        # re-imported for the new setting to take effect.
        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
        _re_import_modules()

        # Cached model files should be used in offline mode
        LLM(model=MODEL_NAME, **offline_engine_options)
    finally:
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        monkeypatch.delenv("HF_HUB_OFFLINE")
        _re_import_modules()
def _re_import_modules():
    """Reload every loaded ``huggingface_hub*`` and ``transformers*`` module.

    Modules whose name starts with ``transformers_modules`` are left alone.
    All ``huggingface_hub`` modules are reloaded before any ``transformers``
    module. A failing reload does not stop the pass; once every module has
    been attempted, the most recent failure (if any) is re-raised.
    """
    # Snapshot the names first: reloading may add entries to sys.modules.
    loaded_names = list(sys.modules)
    hub_names = [
        name for name in loaded_names if name.startswith("huggingface_hub")
    ]
    transformers_names = [
        name for name in loaded_names
        if name.startswith("transformers")
        and not name.startswith("transformers_modules")
    ]

    last_failure = None
    for name in (*hub_names, *transformers_names):
        try:
            importlib.reload(sys.modules[name])
        except Exception as exc:
            # Keep going so that other tests are less likely to be affected
            # by a half-reloaded set of modules.
            last_failure = exc

    # Error this test if reloading a module failed
    if last_failure is not None:
        raise last_failure
...@@ -10,7 +10,6 @@ import pytest ...@@ -10,7 +10,6 @@ import pytest
import torch import torch
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
from vllm.attention.backends.xformers import XFormersBackend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL, from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL,
make_tensor_with_pad) make_tensor_with_pad)
...@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend: ...@@ -521,6 +520,9 @@ def make_backend(backend_name: str) -> AttentionBackend:
* Backend instance * Backend instance
''' '''
if backend_name == STR_XFORMERS_ATTN_VAL: if backend_name == STR_XFORMERS_ATTN_VAL:
# NOTE: xFormers backend cannot be imported for CPU and AMD GPUs.
from vllm.attention.backends.xformers import XFormersBackend
return XFormersBackend() return XFormersBackend()
raise AssertionError( raise AssertionError(
f"Unrecognized backend_name {backend_name} for unit test") f"Unrecognized backend_name {backend_name} for unit test")
......
...@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding ...@@ -7,10 +7,8 @@ from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ..conftest import HfRunner, VllmRunner from ....conftest import HfRunner, VllmRunner
from .utils import check_logprobs_close from ...utils import check_logprobs_close
pytestmark = pytest.mark.vlm
MODEL_NAME = "fixie-ai/ultravox-v0_3" MODEL_NAME = "fixie-ai/ultravox-v0_3"
......
...@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`. ...@@ -7,7 +7,7 @@ Run `pytest tests/models/test_big_models.py`.
import pytest import pytest
import torch import torch
from .utils import check_outputs_equal from ...utils import check_outputs_equal
MODELS = [ MODELS = [
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-7b-hf",
......
...@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`. ...@@ -6,7 +6,7 @@ Run `pytest tests/models/test_danube3_4b.py`.
""" """
import pytest import pytest
from .utils import check_outputs_equal from ...utils import check_outputs_equal
MODELS = ["h2oai/h2o-danube3-4b-base"] MODELS = ["h2oai/h2o-danube3-4b-base"]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment