Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
parent 49204f68
......@@ -6,6 +6,8 @@ from pathlib import Path
import pytest
import requests
import os
from ..utils import models_path_prefix
def _query_server(prompt: str, max_tokens: int = 5) -> dict:
......@@ -30,7 +32,7 @@ def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
"api_server_async_engine.py").absolute()
commands = [
sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m", "--host",
str(script_path), "--model", os.path.join(models_path_prefix, "facebook/opt-125m"), "--host",
"127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size)
]
......
......@@ -18,7 +18,8 @@ from vllm.sampling_params import RequestOutputKind
from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear
import os
from ..utils import models_path_prefix
@dataclass
class RequestOutput:
......@@ -136,7 +137,7 @@ def start_engine():
print(f"Starting engine with num_scheduler_steps={num_scheduler_steps}")
return AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(model="facebook/opt-125m",
AsyncEngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"),
enforce_eager=True,
num_scheduler_steps=num_scheduler_steps))
......
......@@ -16,10 +16,12 @@ from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata
from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
]
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
......@@ -27,7 +29,7 @@ TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
llm = LLM(os.path.join(models_path_prefix, "facebook/opt-125m"))
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
......@@ -78,14 +80,14 @@ def test_models(
# @pytest.mark.parametrize(
# "model, distributed_executor_backend, attention_backend, "
# "test_suite", [
# ("facebook/opt-125m", "ray", "", "L4"),
# ("facebook/opt-125m", "mp", "", "L4"),
# ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
# ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
# ("facebook/opt-125m", "ray", "", "A100"),
# ("facebook/opt-125m", "mp", "", "A100"),
# ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
# ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "L4"),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "L4"),
# (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "ray", "", "L4"),
# (os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), "mp", "", "L4"),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "ray", "", "A100"),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "", "A100"),
# (os.path.join(models_path_prefix, "facebook/opt-125m"), "mp", "FLASHINFER", "A100"),
# (os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray", "FLASHINFER", "A100"),
# ])
# def test_models_distributed(
# hf_runner,
......@@ -138,7 +140,7 @@ def test_model_with_failure(vllm_runner) -> None:
with patch("vllm.model_executor.models.opt.OPTForCausalLM.forward",
side_effect=ValueError()):
with pytest.raises(ValueError) as exc_info:
vllm_runner("facebook/opt-125m",
vllm_runner(os.path.join(models_path_prefix, "facebook/opt-125m"),
dtype="half",
enforce_eager=False,
gpu_memory_utilization=0.7)
......
......@@ -13,10 +13,12 @@ import pytest
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
]
......@@ -207,7 +209,7 @@ def test_with_prefix_caching(
Checks exact match decode with and without prefix caching
with chunked prefill enabled.
"""
model = "meta-llama/Llama-2-7b-chat-hf"
model = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf")
# The common prompt has 142 tokens with Llama-2 tokenizer.
common_prompt = "You are a helpful AI assistant " * 20
unique_prompts = [
......
from ..utils import compare_two_settings
import os
from ..utils import compare_two_settings, models_path_prefix
def test_cpu_offload():
compare_two_settings("meta-llama/Llama-2-7b-hf", [],
compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), [],
["--cpu-offload-gb", "4"])
......@@ -15,9 +15,12 @@ from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix
import os
MODELS = [
"facebook/opt-125m",
]
os.path.join(models_path_prefix, "facebook/opt-125m"),
]
@pytest.fixture(scope="module", autouse=True)
......
......@@ -6,64 +6,66 @@ from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm.plugins import set_torch_compile_backend
from vllm.utils import is_hip
import os
from ..utils import models_path_prefix
TEST_MODELS_SMOKE = [
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
(os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
(os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
]
TEST_MODELS = [
("facebook/opt-125m", {}),
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", {
(os.path.join(models_path_prefix, "facebook/opt-125m"), {}),
(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), {
"dtype": torch.float16,
"quantization": "compressed-tensors"
}),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", {
(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), {
"dtype": torch.float16,
"quantization": "fp8"
}),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", {
(os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples"), {
"quantization": "compressed-tensors"
}),
("meta-llama/Meta-Llama-3-8B", {}),
(os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), {}),
]
# TODO: enable in pytorch 2.5
if False and is_quant_method_supported("aqlm"): # noqa: SIM223
TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", {
TEST_MODELS.append((os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"), {
"quantization": "aqlm"
}))
# TODO: enable in pytorch 2.5
if False and is_quant_method_supported("gguf"): # noqa: SIM223
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", {
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"), {
"quantization": "gguf"
}))
if is_quant_method_supported("gptq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", {
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"), {
"quantization": "gptq"
}))
if is_quant_method_supported("gptq_marlin"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", {
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), {
"quantization": "gptq_marlin"
}))
if is_quant_method_supported("gptq_marlin_24"):
TEST_MODELS.append(("alexm-nm/tinyllama-24-marlin24-4bit-g128", {
TEST_MODELS.append((os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"), {
"quantization": "gptq_marlin_24"
}))
if is_quant_method_supported("marlin"):
TEST_MODELS.append(("robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", {
TEST_MODELS.append((os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"), {
"quantization": "marlin"
}))
if not is_hip() and is_quant_method_supported("awq"):
TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", {
TEST_MODELS.append((os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ"), {
"quantization": "AWQ"
}))
......
......@@ -37,6 +37,7 @@ from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu)
from utils import models_path_prefix
logger = init_logger(__name__)
......@@ -875,8 +876,9 @@ def num_gpus_available():
return cuda_device_count_stateless()
temp_dir = tempfile.gettempdir()
_dummy_path = os.path.join(temp_dir, "dummy_opt")
# temp_dir = tempfile.gettempdir()
# _dummy_path = os.path.join(temp_dir, "dummy_opt")
_dummy_path = os.path.join(models_path_prefix, "facebook/opt-125m")
@pytest.fixture
......
......@@ -5,13 +5,15 @@ import pytest
from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
import os
from ....utils import models_path_prefix
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test.
"enforce_eager": True,
......@@ -89,7 +91,7 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test.
"enforce_eager": True,
......@@ -156,7 +158,7 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# Our prompts will generate 128 tokens; since the prompts themselves are
# small, we don't need much KV space beyond 128.
......@@ -256,7 +258,7 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
[
{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test.
"enforce_eager": True,
......@@ -337,7 +339,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test.
"enforce_eager": True,
......@@ -418,7 +420,7 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test.
"enforce_eager": True,
......@@ -495,7 +497,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
"common_llm_kwargs",
[{
# Use a small model for a fast test.
"model": "facebook/opt-125m",
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
# skip cuda graph creation for fast test.
"enforce_eager": True,
......
......@@ -2,13 +2,15 @@ import random
from typing import List
import pytest
import os
from vllm import LLM, SamplingParams
from .conftest import get_text_from_llm_generator
from ...utils import models_path_prefix
# relatively small model with 4k sliding window
MODEL = "bigcode/starcoder2-3b"
MODEL = os.path.join(models_path_prefix, "bigcode/starcoder2-3b")
BLOCK_SIZE = 16
......
......@@ -13,7 +13,7 @@ from transformers import __version__ as transformers_version
from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test
from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
logger = init_logger("test_pipeline_parallel")
......@@ -24,22 +24,22 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, "
"MODEL_NAME, DIST_BACKEND"),
[
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"),
(1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"),
(2, 2, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(2, 2, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 3, 0, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 4, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 4, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "mp"),
(1, 3, 0, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(1, 4, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(1, 4, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(2, 2, 1, 0, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
(2, 2, 0, 1, 0, os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B"), "ray"),
# NOTE: InternVL2 multi-node tests are flaky,
# use mp backend to skip the multi-node tests
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-1B", "mp"),
(1, 2, 1, 1, 1, "OpenGVLab/InternVL2-2B", "mp"),
(1, 2, 1, 0, 1, "OpenGVLab/InternVL2-4B", "mp"),
(1, 2, 0, 1, 0, "Qwen/Qwen2-VL-2B-Instruct", "mp")
(1, 2, 1, 1, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"), "mp"),
(1, 2, 1, 1, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-2B"), "mp"),
(1, 2, 1, 0, 1, os.path.join(models_path_prefix, "OpenGVLab/InternVL2-4B"), "mp"),
(1, 2, 0, 1, 0, os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"), "mp")
],
)
@fork_new_process_for_each_test
......
......@@ -2,15 +2,15 @@ import os
import pytest
from ..utils import compare_two_settings, fork_new_process_for_each_test
from ..utils import compare_two_settings, fork_new_process_for_each_test, models_path_prefix
@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"),
(2, os.path.join(models_path_prefix, "JackFram/llama-160m")),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
"FLASH_ATTN",
"FLASHINFER",
# "FLASHINFER",
])
@fork_new_process_for_each_test
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
......
......@@ -5,6 +5,7 @@ Run `pytest tests/encoder_decoder/test_e2e_correctness.py`.
from typing import List, Optional, Tuple
import pytest
import os
from transformers import AutoModelForSeq2SeqLM
from vllm.sequence import SampleLogprobs
......@@ -12,6 +13,7 @@ from vllm.utils import is_cpu
from ..conftest import DecoderPromptType
from ..models.utils import check_logprobs_close
from ..utils import models_path_prefix
def vllm_to_hf_output(
......@@ -28,7 +30,7 @@ def vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
@pytest.mark.parametrize("model", ["facebook/bart-large-cnn"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/bart-large-cnn")])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
......
......@@ -3,9 +3,11 @@ import pytest
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
......
......@@ -8,6 +8,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync
from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
class Mock:
......@@ -31,7 +33,7 @@ class CustomGPUExecutorAsync(GPUExecutorAsync):
return await super().execute_model_async(*args, **kwargs)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_custom_executor_type_checking(model):
with pytest.raises(ValueError):
engine_args = EngineArgs(model=model,
......@@ -47,7 +49,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_custom_executor(model, tmpdir):
cwd = os.path.abspath(".")
os.chdir(tmpdir)
......@@ -67,7 +69,7 @@ def test_custom_executor(model, tmpdir):
os.chdir(cwd)
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_custom_executor_async(model, tmpdir):
cwd = os.path.abspath(".")
os.chdir(tmpdir)
......
......@@ -2,9 +2,11 @@ import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
......
......@@ -2,9 +2,11 @@ import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
......
......@@ -7,11 +7,13 @@ Run `pytest tests/engine/test_stop_reason.py`.
"""
import pytest
import os
import transformers
from vllm import SamplingParams
from ..utils import models_path_prefix
MODEL = "facebook/opt-350m"
MODEL = os.path.join(models_path_prefix, "facebook/opt-350m")
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024
......
from typing import Any, List, Optional
import pytest
import os
from vllm import CompletionOutput, LLMEngine, SamplingParams
from ..utils import models_path_prefix
MODEL = "meta-llama/llama-2-7b-hf"
MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
MAX_TOKENS = 200
IS_ASYNC = False
......
......@@ -2,12 +2,14 @@ import weakref
from typing import List
import pytest
import os
from vllm import LLM, EmbeddingRequestOutput, PoolingParams
from ...conftest import cleanup
from ...utils import models_path_prefix
MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
PROMPTS = [
"Hello, my name is",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment